AI Recommendation System data crawler baseline

parent 9bc3bdb3
/classification/classification_category_raw_data/
/classification/classification_data/
/classification/fasttext_data/
/junk/
/readme.md
.idea
\ No newline at end of file
# Single-stage GitLab CI pipeline: deploys the crawler onto the production
# host as three systemd services (crawler, crawler-api, crawler-processors).
stages:
- deploy
production:
stage: deploy
tags:
# Runs on the runner labelled for the production backend machine.
- python-backend-prod
environment: PROD
only:
# Deploy only from master.
- master
script:
- whoami
- pwd
# Activate production settings by copying them over the generic config.
- cp config/prod/*.py config
- mkdir -p /datadrive/deepcare/crawler || true
- sudo cp -r * /datadrive/deepcare/crawler
# Install and enable the three systemd units.
- sudo cp /datadrive/deepcare/crawler/crawler.service /etc/systemd/system/
- sudo systemctl enable crawler.service
- sudo cp /datadrive/deepcare/crawler/crawler-api.service /etc/systemd/system/
- sudo systemctl enable crawler-api.service
- sudo cp /datadrive/deepcare/crawler/crawler-processors.service /etc/systemd/system/
- sudo systemctl enable crawler-processors.service
# Reload unit definitions, then (re)start everything.
- sudo systemctl daemon-reload
- sudo systemctl restart crawler.service
- sudo systemctl restart crawler-api.service
- sudo systemctl restart crawler-processors.service
\ No newline at end of file
# recommendation-system-crawler
## Cài đặt môi trường python 3.6 và các thư viện:
```
pip install -r requirements.txt
```
## Thay đổi các tùy chọn cài đặt trong thư mục settings tương ứng với 2 môi trường dev và production trong 2 thư mục settings/dev_settings và settings/prod_settings
```
api_settings.py # địa chỉ và port dành cho api
general_settings.py # đường dẫn các file dữ liệu
kafka_settings.py # địa chỉ và cổng của kafka cùng tên các topics
mysql_settings.py # username, password, host, port để truy cập database với mysql (sử dụng port đã bind với local ở lệnh đầu tiên)
redis_settings.py # host, port, db của redis
```
### Đối với cài đặt của mysql trong mysql_settings:
#### dev_settings
```
# DEV ENV (thông tin mysql trên server dev)
USERNAME = 'root'
PASSWORD = '123456'
DATABASE = 'news_content_support'
HOST = '0.0.0.0'
PORT = 3333
# PROD ENV (thông tin mysql trên server production để kết nối đẩy tin crawl từ server dev lên server production)
USERNAME_PROD = ''
PASSWORD_PROD = ''
DATABASE_PROD = ''
HOST_PROD = ''
PORT_PROD = 3306
```
#### prod_settings
```
# thông tin mysql để thực hiện kết nối trên chính server production
USERNAME = ''
PASSWORD = ''
DATABASE = ''
HOST = ''
PORT = 3306
```
### Đối với cài đặt url của api để upload tin tức, thay đổi UPLOAD_NEWS_URL ở trong api_settings.py trong 2 thư mục config/dev, config/prod tương ứng 2 môi trường dev và production
```
UPLOAD_NEWS_URL = ''
```
### Thực hiện copy các cài đặt tương ứng từ prod/dev lên thư mục config
```
cp config/dev/* config/
or
cp config/prod/* config/
```
## Docker kafka:
### Đối với server dev, nếu docker chưa được chạy, truy cập thư mục news_content_support/lab/docker/kafka-docker (user trith), sử dụng lệnh:
```
# Chạy docker kafka
> sudo docker-compose up -d
# tạo topic add_news_topic
> docker exec -t kafka-docker_kafka_1 kafka-topics.sh --bootstrap-server :9092 --create --topic add_news_topic --partitions 3 --replication-factor 1
```
## Thực hiện deploy project
### Đối với môi trường dev:
```
python run_api.py # chạy backend cho tool edit tin tức
python run_processors.py # chạy consumer để xử lý các tin tức đã crawl từ kafka topic add_news_topic để đẩy vào mysql trên cả 2 server dev và production
python run.py # chạy crawler
```
API tạo user:
For development
DB information:
from flask import Blueprint, request, Response, jsonify
from sqlalchemy.orm.session import Session
from flask_sqlalchemy_session import current_session
from uuid import uuid4
from sqlalchemy import and_, not_
import requests
from treelib import Tree
import time
from utils.entity_utils import entity_to_dict
from mysql.entities import *
from config import api_settings
# Blueprint for the category endpoints.
category_blueprint = Blueprint('category', __name__)
# Request-scoped SQLAlchemy session (bound via flask_scoped_session in the app factory).
session: Session = current_session
@category_blueprint.route("/api/getCategories", methods=['GET'])
def get_categories():
    """Return all non-deleted root categories (parent_id IS NULL) as JSON."""
    roots = session.query(Category).filter(
        not_(Category.deleted),
        Category.parent_id.is_(None)
    ).all()
    payload = []
    for category in roots:
        payload.append(category.to_dict())
    return jsonify(payload)
# @category_blueprint.route('/api/addCategory', methods=['POST'])
# def add_category():
# key = request.headers.get('key')
# assert key == 'deepcare@2020'
#
# data = request.get_json()
#
# try:
# if type(data) == list:
# categories = [
# Category(
# category_id=category['category_id'],
# category_name=category['name'],
# parent_id=category['parent_id']
# )
# for category in data
# ]
#
# session.add_all(categories)
# session.commit()
# else:
# category = Category(
# category_id=data['category_id'],
# category_name=data['category_name'],
# parent_id=data['parent_id']
# )
#
# session.add(category)
# session.commit()
#
# return Response('Add category successfully!!!', status=500)
# except:
# session.rollback()
# return Response('Failed to add category!!!', status=500)
def get_branch(tree, node):
    """Return the depth-1 ancestor ("branch") of *node* in *tree*.

    Nodes at depth 0 or 1 have no branch, so None is returned for them.
    Otherwise the tree is walked upward until one level below the root.
    """
    depth = tree.level(node)
    if depth <= 1:
        return None
    current = node
    for _ in range(depth - 1):
        current = tree.parent(current).identifier
    return tree[current]
@category_blueprint.route('/ex-api/addCategories', methods=['POST'])
def add_categories():
    """Bulk-insert categories from a JSON list, skipping ids already stored.

    Expects the ``key`` header to equal the shared secret and a JSON body
    that is a list of {category_id, category_name, parent_id, type} dicts.
    Returns {'success': bool, 'message': str}.
    """
    # NOTE(review): `assert`-based validation is stripped under `python -O`;
    # explicit 400/401 responses would be safer.
    key = request.headers.get('key')
    assert key == 'deepcare@2020'
    data = request.get_json()
    assert type(data) is list
    # Ids already present (and not soft-deleted) — used to skip duplicates.
    created_category_ids = [
        category.category_id
        for category in session.query(Category).filter(
            not_(Category.deleted),
        ).all()
    ]
    # Build a treelib Tree so traversal yields parents before children;
    # top-level categories (falsy parent_id) hang off a synthetic 'root'.
    # NOTE(review): assumes each parent appears before its children in the
    # request body — treelib raises otherwise. Confirm with the caller.
    tree = Tree()
    tree.create_node(tag='Root', identifier='root')
    for category in data:
        tree.create_node(
            tag=category['category_name'],
            identifier=category['category_id'],
            parent=category['parent_id'] if category['parent_id'] else 'root',
            data={
                'category_id': category['category_id'],
                'category_name': category['category_name'],
                'parent_id': category['parent_id'] if category['parent_id'] else None,
                'type': category['type']
            }
        )
    tree.show()  # debug aid: prints the category tree to stdout
    # Commented-out sync with the remote CMS API, kept for reference:
    # for node in tree.expand_tree(mode=tree.DEPTH):
    #     data = tree[node].data
    #
    #     if data and data['category_id'] not in created_category_ids:
    #         branch = get_branch(tree, node)
    #         res = requests.post(
    #             url='http://dev.deepcare.io:8181/api/v1/cms/news/addTopic',
    #             json={
    #                 'topic_name': data['category_name'],
    #                 'parent': data['parent_id'],
    #                 'level': tree.level(node),
    #                 'branch': branch.data['category_id'] if branch else None
    #             },
    #             headers={
    #                 'content-type': 'application/json',
    #                 'x-access-token': api_settings.SERVER_API_TOKEN
    #             }
    #         )
    #
    #         res_data = res.json()
    #
    #         if res_data.get('status') == 'OK':
    #             topic_id = res_data.get('topic_id')
    #             data['category_id'] = topic_id
    #             for child in tree.children(node):
    #                 child.data['parent_id'] = topic_id
    #
    # Insert only the nodes that are new, walking the tree so parents are
    # added before their children.
    categories = []
    for node in tree.expand_tree():
        category = tree[node].data
        if category and category['category_id'] not in created_category_ids:
            categories.append(Category(**category))
    try:
        session.add_all(categories)
        session.commit()
        return jsonify({
            'success': True,
            'message': "Add categories successfully!!!"
        })
    except:
        # NOTE(review): bare except also swallows SystemExit/KeyboardInterrupt.
        session.rollback()
        return jsonify({
            'success': False,
            'message': "Fail to add categories!!!"
        })
from flask import request, Response, jsonify, Blueprint
from sqlalchemy import desc, and_, or_, not_, func
from flask_sqlalchemy_session import current_session
from sqlalchemy.orm.session import Session
from uuid import uuid4
import time
from mysql.entities import *
from utils.entity_utils import entity_to_dict
# Blueprint for the news-censorship endpoints.
censor_blueprint = Blueprint('censor_news', __name__)
# Request-scoped SQLAlchemy session (bound via flask_scoped_session in the app factory).
session: Session = current_session
@censor_blueprint.route('/api/censor/censorNews', methods=['POST'])
def censor_news():
    """Approve / ignore / schedule an edited news item.

    Headers: ``token`` = user_id of a censor-role user.
    Body: {id, status, scheduleDate?} where status is one of
    approved / ignored / scheduled; scheduleDate is required when scheduling.
    Returns {'success': bool, 'message': str}.
    """
    user_id = request.headers.get('token')
    data = request.get_json()
    mkt_edited_news_id = data.get('id')
    status = data.get('status')
    schedule_date = data.get('scheduleDate')
    # NOTE(review): `assert`-based validation disappears under `python -O`.
    assert user_id is not None
    assert mkt_edited_news_id is not None
    assert status in [Status.approved.name, Status.ignored.name, Status.scheduled.name]
    if status == Status.scheduled.name:
        assert schedule_date is not None
    # Only non-deleted users with the censor role may change news status.
    user = session.query(User).filter(
        User.user_id == user_id,
        not_(User.deleted)
    ).first()
    assert user is not None and user.role == Role.censor
    # The item must still be pending: not deleted, not already finalized.
    mkt_edited_news = session.query(MarketingEditedNews).filter(
        MarketingEditedNews.mkt_edited_news_id == mkt_edited_news_id,
        not_(MarketingEditedNews.deleted),
        not_(MarketingEditedNews.status.in_([
            Status.approved.name, Status.ignored.name, Status.trash.name, Status.scheduled.name
        ]))
    ).first()
    assert mkt_edited_news is not None
    try:
        mkt_edited_news.censor_user_id = user_id
        mkt_edited_news.status = status
        if status == Status.scheduled.name:
            # scheduleDate arrives as a string/number timestamp; stored as float.
            mkt_edited_news.schedule_date = float(schedule_date)
        session.commit()
        res = {
            'success': True,
            'message': f'{status} news successfully!!!'
        }
    except Exception as e:
        # NOTE(review): no session.rollback() here — on failure the session
        # may be left with a broken pending transaction.
        print(e)
        res = {
            'success': False,
            'message': f'{e}'
        }
    return jsonify(res)
This diff is collapsed.
from flask import Blueprint, request, Response, jsonify
from sqlalchemy.orm.session import Session
from flask_sqlalchemy_session import current_session
from uuid import uuid4
from sqlalchemy import and_, not_
from utils.entity_utils import entity_to_dict
from mysql.entities import *
# Blueprint for the crawl-domain endpoints.
domain_blueprint = Blueprint('domain', __name__)
# Request-scoped SQLAlchemy session (bound via flask_scoped_session in the app factory).
session: Session = current_session
@domain_blueprint.route("/api/getDomains", methods=['GET'])
def get_domains():
    """List every crawl domain as {domain_id, domain_name} pairs."""
    payload = []
    for domain in session.query(Domain).all():
        payload.append({
            'domain_id': domain.domain_id,
            'domain_name': domain.domain_name
        })
    return jsonify(payload)
@domain_blueprint.route("/api/addDomains", methods=['POST'])
def add_domains():
    """Bulk-register crawl domains (each with its start URLs) from a JSON list.

    Requires the ``key`` header to match the shared secret. Returns a plain
    'OK' string on success and a 500 response on failure.
    """
    # NOTE(review): `assert` is stripped under `python -O`; an explicit 401
    # response would be safer.
    key = request.headers.get('key')
    assert key == 'deepcare@2020'
    data = request.get_json()
    assert type(data) is list
    domains = [
        Domain(
            domain_id=str(uuid4()),
            domain_name=domain['domain_name'],
            next=domain['next'],
            url=domain['url'],
            thumbnail=domain['thumbnail'],
            title=domain['title'],
            summary=domain['summary'],
            content=domain['content'],
            tags=domain['tags'],
            time=domain['time'],
            start_urls=[
                StartUrl(
                    start_url_id=str(uuid4()),
                    using_next_button=start_url['using_next_button'],
                    max_no_pages=start_url['max_no_pages'],
                    start_url=start_url['start_url']
                )
                for start_url in domain['start_urls']
            ]
        )
        for domain in data
    ]
    try:
        session.add_all(domains)
        session.commit()
        return 'OK, add domains successfully!!!'
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt are
        # no longer swallowed; the failed transaction is rolled back.
        session.rollback()
        return Response('Fail to add domains', status=500)
This diff is collapsed.
from flask import Flask
from flask_cors import CORS
from flask_sqlalchemy_session import flask_scoped_session
from mysql.engine import connect
from config import mysql_settings
from .marketing import marketing_news
from .dev import dev_news
from .user import user_blueprint
from .category import category_blueprint
from .domain import domain_blueprint
from .censor import censor_blueprint
def get_app():
    """Build and configure the Flask app: DB session, CORS, and blueprints.

    Returns a ready-to-run Flask application whose SQLAlchemy session is
    scoped to the request lifecycle via flask_sqlalchemy_session.
    """
    # Session factory bound to the MySQL settings of the active environment.
    Session = connect(
        username=mysql_settings.USERNAME,
        password=mysql_settings.PASSWORD,
        db=mysql_settings.DATABASE,
        host=mysql_settings.HOST,
        port=mysql_settings.PORT
    )
    app = Flask(__name__)
    # Allow cross-origin calls to the /api/* endpoints (editor frontend).
    cors = CORS(app, resources={r"/api/*": {"origins": "*"}})
    # Request-scoped session; blueprint modules access it via current_session.
    session = flask_scoped_session(Session, app=app)
    app.register_blueprint(marketing_news)
    app.register_blueprint(dev_news)
    app.register_blueprint(censor_blueprint)
    app.register_blueprint(user_blueprint)
    app.register_blueprint(category_blueprint)
    app.register_blueprint(domain_blueprint)
    return app
from flask import Blueprint, request, Response, jsonify
from flask_sqlalchemy_session import current_session
from uuid import uuid4
from sqlalchemy.orm.session import Session
from sqlalchemy import and_, not_
import requests
from treelib import Tree
from mysql.entities import *
from config import api_settings
# Blueprint for authentication and user-management endpoints.
user_blueprint = Blueprint('user', __name__)
# Request-scoped SQLAlchemy session (bound via flask_scoped_session in the app factory).
session: Session = current_session
@user_blueprint.route('/api/login', methods=['POST'])
def login():
    """Authenticate a user by username/password and return their profile.

    Returns a 500 response when the user is unknown or the password is wrong.
    """
    data = request.get_json()
    username = data['username']
    password = data['password']
    # Fixed: credentials were printed to stdout here — removed, because
    # logging plaintext passwords leaks secrets into server logs.
    assert username and password
    user = session.query(User).filter(User.name == username).first()
    if user is None:
        return Response('User is not existed!!', status=500)
    # NOTE(review): passwords are stored and compared in plaintext — they
    # should be hashed before this ships.
    if user.password != password:
        return Response('Password is not correct!!!', status=500)
    response_data = {
        'username': username,
        'user_id': user.user_id,
        'role': user.role.name
    }
    return jsonify(response_data)
@user_blueprint.route("/api/getUserInfo", methods=['GET'])
def get_user_info():
    """Resolve the 'token' header (a user_id) to the user's public profile.

    Returns JSON null when no matching non-deleted user exists.
    """
    user_id = request.headers.get('token')
    user = session.query(User).filter(
        and_(
            User.user_id == user_id,
            not_(User.deleted)
        )
    ).first()
    if user is None:
        return jsonify(None)
    return jsonify({
        'username': user.name,
        'user_id': user.user_id,
        'role': user.role.name
    })
@user_blueprint.route('/api/register', methods=['POST'])
def register():
    """Create a new user (username/password/role), guarded by a shared key.

    Returns the created profile as JSON, or a 500 response on any failure.
    """
    key = request.headers.get('key')
    if key != 'deepcare2020':
        return Response('api_key is not correct!!!', status=500)
    data = request.get_json()
    username = data['username']
    password = data['password']
    role = data['role']
    user_existed = session.query(User).filter(User.name == username).count() > 0
    if user_existed:
        return Response('Username is existed!!!', status=500)
    user = User(
        # Fixed: every other id column in this project stores str(uuid4());
        # a raw UUID object was passed here, which is inconsistent and not
        # JSON-serializable by default.
        user_id=str(uuid4()),
        name=username,
        password=password,
        role=role
    )
    try:
        session.add(user)
        session.commit()
    except Exception:
        # Narrowed from bare `except:`; roll back so the session stays usable.
        session.rollback()
        return Response(status=500)
    res = jsonify({
        'username': user.name,
        'role': role,
        'user_id': user.user_id
    })
    return res
@user_blueprint.route('/api/runscript', methods=['POST'])
def run_script():
    """One-off migration endpoint: adds video columns to mkt_edited_news.

    Guarded by the shared ``key`` header. Returns {'status': 'OK'|'KO', ...}.
    """
    key = request.headers.get('key')
    if key != 'deepcare@2020':
        return Response('api_key is not correct!!!', status=500)
    # Raw DDL executed verbatim; acceptable only because the statement is
    # hard-coded (no user input is interpolated).
    script = """
    ALTER TABLE `mkt_edited_news`
    ADD COLUMN `video` VARCHAR(100) NULL AFTER `author_name`,
    ADD COLUMN `video_type` VARCHAR(50) NULL AFTER `video`;
    """
    try:
        # NOTE(review): passing a plain string to Session.execute is removed
        # in SQLAlchemy 2.0 (wrap in sqlalchemy.text) — confirm the pinned
        # SQLAlchemy version before upgrading.
        session.execute(script)
        session.commit()
    except Exception as e:
        return jsonify({
            'status': 'KO',
            'message': str(e)
        })
    return jsonify({
        'status': 'OK',
        'message': 'Done!!!'
    })
def get_self_or_descendant(topic, parent, type):
    """Flatten *topic* and all of its descendants into a list of dicts.

    Each entry carries topic_id, topic_name, its parent's id (None-able for
    the given root), and the supplied *type* tag. Order is parent-first.
    """
    flat = [{
        'topic_id': topic['topic_id'],
        'topic_name': topic['topic_name'],
        'parent': parent,
        'type': type
    }]
    for child in topic.get('children', []):
        flat += get_self_or_descendant(child, parent=topic['topic_id'], type=type)
    return flat
def get_order_topics(topics, type):
    """Return topic dicts in depth-first order (parents before children).

    Builds a treelib Tree from the flattened topic forest — topics without a
    parent hang off a synthetic 'root' node — and yields each node's data in
    DEPTH-first traversal order, skipping the data-less root.
    """
    tree = Tree()
    tree.create_node(tag='Root', identifier='root')
    flattened = (
        entry
        for topic in topics
        for entry in get_self_or_descendant(topic, None, type)
    )
    for entry in flattened:
        tree.create_node(
            tag=entry['topic_name'],
            identifier=entry['topic_id'],
            parent=entry['parent'] or 'root',
            data=entry
        )
    return [
        tree[node].data
        for node in tree.expand_tree(mode=tree.DEPTH)
        if tree[node].data
    ]
@user_blueprint.route('/api/updateCategories', methods=['POST'])
def update_categories():
    """Sync categories from the remote CMS into the local Category table.

    Fetches the patient and doctor topic trees from the CMS API, flattens
    them parent-first, and inserts any topic id not yet stored locally.
    Returns {'status': 'OK'|'KO', 'message': str}.
    """
    key = request.headers.get('key')
    if key != 'deepcare@2020':
        return Response('api_key is not correct!!!', status=500)
    try:
        # Obtain a short-lived token for the CMS API.
        token_res = requests.get(api_settings.GEN_TOKEN_URL)
        patient_topics = requests.get(
            url=api_settings.GET_ALL_TOPIC_PATIENT,
            headers={
                'x-access-token': token_res.json()['token']
            }
        )
        doctor_topics = requests.get(
            url=api_settings.GET_ALL_TOPIC_DOCTOR,
            headers={
                'x-access-token': token_res.json()['token']
            }
        )
        # Topic ids already present locally (not soft-deleted).
        local_topic_ids = [t[0] for t in session.query(Category.category_id).filter(
            not_(Category.deleted),
        ).all()]
        # Depth-first order guarantees parents precede children on insert.
        order_patient_topics = get_order_topics(patient_topics.json()['result'], 'BENH_NHAN')
        order_doctor_topics = get_order_topics(doctor_topics.json()['result'], 'BAC_SI')
        # De-duplicate by topic_id; the first occurrence (patient list) wins.
        all_new_topics = {}
        for topic in [*order_patient_topics, *order_doctor_topics]:
            if topic['topic_id'] in local_topic_ids: continue
            if all_new_topics.get(topic['topic_id']) is not None: continue
            all_new_topics[topic['topic_id']] = topic
        for topic in all_new_topics.values():
            session.add(
                Category(
                    category_id=topic['topic_id'],
                    category_name=topic['topic_name'],
                    parent_id=topic['parent'],
                    deleted=0,
                    type=topic['type']
                )
            )
        session.commit()
        return jsonify({
            'status': 'OK',
            'message': 'Update successfully!!!'
        })
    except Exception as e:
        # NOTE(review): no session.rollback() on failure — the session may be
        # left with a broken pending transaction.
        return jsonify({
            'status': 'KO',
            'message': f'Failed to update categories. Error: {str(e)}'
        })
import pandas as pd
import os
# Category slug -> Vietnamese display name for every supported news category.
categories = {
    'an_toan_thuc_pham': "An toàn thực phẩm",
    'co_xuong_khop': "Cơ xương khớp",
    'da_lieu': "Da liễu",
    'dinh_duong': "Dinh dưỡng",
    'ho_hap': "Hô hấp",
    'huyet_hoc': "Huyết học",
    # 'kham_lam_sang': "Khám lâm sàng",
    'khoe_dep': "Khỏe đẹp",
    'mat': "Mắt",
    'nam_gioi': "Nam giới",
    'nhi': "Nhi",
    'nu_gioi': "Nữ giới",
    'rang': "Răng",
    'san_phu_khoa': "Sản phụ khoa",
    'tai_mui_hong': "Tai mũi họng",
    'tam_than': "Tâm thần",
    'than_kinh': "Thần kinh",
    'than_nieu': "Thận niệu",
    'thuoc': "Thuốc",
    'thuoc_va_thuc_pham': "Thuốc và thực phẩm",
    'tieu_duong': "Tiểu đường",
    'tieu_hoa': "Tiêu hóa",
    'tim_mach': "Tim mạch",
    'ung_thu': "Ung thư",
    'y_hoc_co_truyen': "Y học cổ truyền"
}
# slug -> fastText label index (insertion order of the dict above)
category2index = {}
# fastText label string ('__label__N') -> CMS category id
label2id = {}
# CMS category id -> category display name
id2name = {}
for i, (key, value) in enumerate(categories.items()):
    category2index[key] = i
# mapping_categories.csv (tab-separated; columns include label, category_id,
# category_name) sits next to this module and is loaded once at import time.
df = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'mapping_categories.csv'), sep='\t')
for category in df.itertuples():
    label2id[category.label] = category.category_id
    id2name[category.category_id] = category.category_name
import fasttext
import re
from .categories import label2id
class NewsClassifier:
    """Wrapper around a trained fastText model that maps the predicted
    fastText label to the CMS category id via `label2id`.
    """

    def __init__(self, model_path):
        # Loads the serialized fastText supervised model from disk.
        self.model = fasttext.load_model(model_path)

    def predict(self, text, process_new_line=True, process_num=True):
        """Classify one document; returns a category id or None.

        NOTE(review): the flag semantics look inverted — normalization runs
        when the flag is False. Kept as-is for caller compatibility.
        """
        if not process_new_line:
            text = re.sub(r'\s+', ' ', text)
        if not process_num:
            # Fixed: the replacement must keep its surrounding spaces
            # (' __NUM__ ') to match the training preprocessing; the previous
            # '__NUM__' glued the neighbouring words together.
            text = re.sub(r' \d+ ', ' __NUM__ ', text)
        return label2id.get(self.model.predict(text)[0][0])

    def predict_many(self, docs, process_new_line=True, process_num=True):
        """Classify a list of documents; returns a list of category ids."""
        if not process_new_line:
            docs = [re.sub(r'\s+', ' ', doc) for doc in docs]
        if not process_num:
            # Same fix as in predict(): keep the spaces around the token.
            docs = [re.sub(r' \d+ ', ' __NUM__ ', doc) for doc in docs]
        # fastText returns (labels, probabilities); take the top label per doc.
        preds = self.model.predict(docs)
        preds = [label2id[pred[0]] for pred in preds[0]]
        return preds
label category_id category_name deleted parent_id type
__label__ TP-hSDs7CSz92TB5W1asHxVGNtT6S4W4ZnZ8roNMl1n65 Sống Khoẻ Mỗi Ngày 0 BENH_NHAN
__label__ TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB Kiến Thức Y Học 0 BENH_NHAN
__label__ TP-12NKT2IPhvNfgotlnjDWmYBGTwuc4ZvDuhR65t6cHC Dịch Virus Corona 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__ TP-14OzlPP7VR5spVjvyDSc7QSMrQHNFMPe8Udi40wPP1 Nội tiết 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__ TP-1C99lvUIqvURViO9WZaGTNaNN00Ds35BhBHTR5biN5 Sức Khoẻ Tâm Lý 0 TP-hSDs7CSz92TB5W1asHxVGNtT6S4W4ZnZ8roNMl1n65 BENH_NHAN
__label__ TP-1Ip1eIEWv4Fx8rJFszQO1ZQNIJe1RB4olByO4NEc48 Sức Khoẻ Tình Dục 0 TP-hSDs7CSz92TB5W1asHxVGNtT6S4W4ZnZ8roNMl1n65 BENH_NHAN
__label__16 TP-2rZIbCAhvyGeTatw8MY30ODzcsZcrJk45qzKcI0Xju Thận Niệu 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__4 TP-36ZUNfh4trpmwdnszoENobjf0R0lGGVIWVNOoQB4Rw Hô Hấp 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__ TP-3fef24CIeRNUB7BBpIN4J7bPmkdlq6Is3wLZ2n7PLc Sức Khoẻ Tuổi Già 0 TP-hSDs7CSz92TB5W1asHxVGNtT6S4W4ZnZ8roNMl1n65 BENH_NHAN
__label__21 TP-4TTlZY2sNKbLPWO09b7IC8KsMXmiAlSc1LD10q92qj Tim Mạch 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__ TP-5YFQkvQAl0j9iO72g442Z9yhQ4L44zBJELN5dSWaIl Bệnh Mãn Tính 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__ TP-891Ydf1cOs1zP9O1PiUrK6DiDG39p3wl8AW9X5DzBt Sức Khoẻ Tuổi Teen 0 TP-hSDs7CSz92TB5W1asHxVGNtT6S4W4ZnZ8roNMl1n65 BENH_NHAN
__label__11 TP-9Gv6JmgExxUezDHMjG0GQy6LXFa0CIaxP9mLxvNp5Q Chăm Sóc Răng Miệng 0 TP-hSDs7CSz92TB5W1asHxVGNtT6S4W4ZnZ8roNMl1n65 BENH_NHAN
__label__20 TP-DIq4GcY116QTjJ4sylqSe1BNz6yr4Z9G7Nk1C17936 Tiêu Hoá 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__15 TP-DnK3wcdvUTDZztCzE1BIsLMJTlud8VVW6OFPhb1StZ Thân Kinh 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__14 TP-DnO1e8WQhQ9NC5IAknLsl1GKLXuTuJz1drijiCeJ2B Tâm Thần 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__6 TP-HLMLdeEL7qy2XDFO7MvtRwrj7oSSXOxmuiQMU7N8cZ Phụ Nữ Khoẻ Đẹp 0 TP-hSDs7CSz92TB5W1asHxVGNtT6S4W4ZnZ8roNMl1n65 BENH_NHAN
__label__ TP-Jm7qk9NX7i7fvXNfl2urpBhgaf22jDUBtFNFG6xeWl Tập Luyện 0 TP-hSDs7CSz92TB5W1asHxVGNtT6S4W4ZnZ8roNMl1n65 BENH_NHAN
__label__19 TP-L47Ka1M84eaiELWbjqnxWEU8mRn0XLGwKtj8SYiS8i Bệnh Tiểu Đường/Đái Tháo Đường 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__9 TP-RNE1eANAilG9yIs0wHnfWg4h4WYaweEADT9560CHxD Sức Khoẻ Của Bé 0 TP-hSDs7CSz92TB5W1asHxVGNtT6S4W4ZnZ8roNMl1n65 BENH_NHAN
__label__22 TP-SFZ4MTffK0tR89Rd8gmq6O941GeX9EP72FsvyM91pC Ung Thu 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__12 TP-UrdEd6SQbd25jT3F1iCdT6bG1GEG6zUr26MjbWCqs6 Sản Phụ Khoa 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__ TP-VgT6eEyEQ2HDZ2OWHxQjx6MZrdz2Tcmct6Q2d3k23e Giảm Cân 0 TP-hSDs7CSz92TB5W1asHxVGNtT6S4W4ZnZ8roNMl1n65 BENH_NHAN
__label__ TP-W0jp2iNsz4I40LSe0Z2Yug0AFlns5jobYVQ76F2iGd Tăng Cân 0 TP-hSDs7CSz92TB5W1asHxVGNtT6S4W4ZnZ8roNMl1n65 BENH_NHAN
__label__3 TP-eGXa9REg8mGwTt9mMVxpV2AA9H6n88ohiMYQFXBskD Dinh Dưỡng 0 TP-hSDs7CSz92TB5W1asHxVGNtT6S4W4ZnZ8roNMl1n65 BENH_NHAN
__label__2 TP-f8QDnv0HuziQxfnkMxJrIAZ211lt1I8xvE3FZN0ufI Da Liễu 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__5 TP-gb44HUxWogvqDwqwXBScU65luWqS5wH6YGhig50fti Huyết Học 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__1 TP-hlG11P14iZyZ2yNEzJNP5Yb92ZIelsRiU58Amf1w8z Cơ-Xương-Khớp 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__23 TP-jd0ckT919RXiIVbDaevH9pumpGWGAtD9l73DD0c9s8 Y Học Cổ Truyển 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__13 TP-qwIYQSiJLAnwkF9A4vJh8r0vMima7anM5nqOuUT8r9 Tai Mũi Họng 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__ TP-rd5od8099eMvk22KFo9zV5feDXYSuE82pD4wV9meW2 Bệnh Dị Ứng 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__ TP-ufr1oDYssmOB2P1E1PHtB11KiTn1DZNf3NiI1D3WL4 Răng Hàm Mặt 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__8 TP-v55eBt6NVaJnRF6DTZZSJ5HCCly6j4itQ4oeYc6k1q Sức Khoẻ Nam Giới 0 TP-hSDs7CSz92TB5W1asHxVGNtT6S4W4ZnZ8roNMl1n65 BENH_NHAN
__label__18 TP-wk7jYUswMrCbdm0Pezb0fT2zc4uqr9ahntFnoNxfN2 Thuốc Và Thực Phẩm 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
__label__7 TP-wtil4V95jbxu0ZSrRjmnei9iYGruyuBV2BneR3PPr9 Mắt 0 TP-PAAaJdy95b1dgoJPlVPXVE9FMJK1xpUOps2FPqwqDB BENH_NHAN
label topic_name topic_id parent level branch create_at topic_en topic_fr
__label__0 An toàn thực phẩm TP-dBd2NKk9zF1p09lT38ho90YJsqW2b94qf5Y1PN8b9V 1 TP-dBd2NKk9zF1p09lT38ho90YJsqW2b94qf5Y1PN8b9V 1581411210111
__label__1 Cơ xương khớp TP-B6QL1djFPAHGNItc6NubQ6TWONg2oFs9Q3JV4WTvuw 1 TP-B6QL1djFPAHGNItc6NubQ6TWONg2oFs9Q3JV4WTvuw 1581411212602
__label__2 Da liễu TP-H02I4fQ4WMm3s3BFlZ2Ccni5NmNs1145kVZCWItD3P 1 TP-H02I4fQ4WMm3s3BFlZ2Ccni5NmNs1145kVZCWItD3P 1581411215459
__label__3 Dinh dưỡng TP-hi0l84cdNdT0pmFrehXkX2VvrurNww4zm1Y2S2LxCg 1 TP-hi0l84cdNdT0pmFrehXkX2VvrurNww4zm1Y2S2LxCg 1581411217950
__label__4 Hô hấp TP-riAxk9FBOhmth99dImrgNoD9r9zuGiELBTOKvIXlMd 1 TP-riAxk9FBOhmth99dImrgNoD9r9zuGiELBTOKvIXlMd 1581411223010
__label__5 Huyết học TP-HwqOVVkvIt18oLJVWk1v683zlarQJwPBLz0HHuHd8C 1 TP-HwqOVVkvIt18oLJVWk1v683zlarQJwPBLz0HHuHd8C 1581411220505
__label__6 Khỏe đẹp TP-vEXWfsB8BA4AK6Y7at3YsTjoZaZ6JDC635bBYhtDbC 1 TP-vEXWfsB8BA4AK6Y7at3YsTjoZaZ6JDC635bBYhtDbC 1581411225731
__label__7 Mắt TP-5tgE8zu7FA3KP06rIJhewaoksFLmPyQEKpDAvgQe4h 1 TP-5tgE8zu7FA3KP06rIJhewaoksFLmPyQEKpDAvgQe4h 1581411228227
__label__8 Nam giới TP-GKpm0S6864VMD8MEE08eJ7qsXxoYgo6sC6FQ1HU5DU 1 TP-GKpm0S6864VMD8MEE08eJ7qsXxoYgo6sC6FQ1HU5DU 1581411230724
__label__9 Nhi TP-MGb5V1n0l0O9nr0jKJ0150OdaFz71lFwdxj0lT0iZh 1 TP-MGb5V1n0l0O9nr0jKJ0150OdaFz71lFwdxj0lT0iZh 1581411233213
__label__10 Nữ giới TP-dPmaJkUsBTaFf5q09mo36RXU1A57BqCjT6a90luYMG 1 TP-dPmaJkUsBTaFf5q09mo36RXU1A57BqCjT6a90luYMG 1581411235910
__label__11 Răng TP-izkL313lDwnf7LPUcyd7Lv5nBaWbG6sZAIvwVDAXR2 1 TP-izkL313lDwnf7LPUcyd7Lv5nBaWbG6sZAIvwVDAXR2 1581411238401
__label__12 Sản phụ khoa TP-bl91nus82gwEOc91DHN9HeytXdq2cyFeTbc9FILz00 1 TP-bl91nus82gwEOc91DHN9HeytXdq2cyFeTbc9FILz00 1581411241119
__label__13 Tai mũi họng TP-8L84oJAHOO4PeyUp4yv2wqI4h71yH1Im4z4Mu0xcIE 1 TP-8L84oJAHOO4PeyUp4yv2wqI4h71yH1Im4z4Mu0xcIE 1581411243608
__label__14 Tâm thần TP-MgW7UZrbbTcI2CjU4V7GCxdHO8nUv5H9ZNWI2BObGu 1 TP-MgW7UZrbbTcI2CjU4V7GCxdHO8nUv5H9ZNWI2BObGu 1581411261128
__label__15 Thần kinh TP-dLoUSEk810sOPzTW5t16iNXUzFeNBONSIsg71vZnc9 1 TP-dLoUSEk810sOPzTW5t16iNXUzFeNBONSIsg71vZnc9 1581411251096
__label__16 Thận niệu TP-6sBTBvZIvz7hT7rRDMA6CMl4EMgtA7wGdORFDQUgXs 1 TP-6sBTBvZIvz7hT7rRDMA6CMl4EMgtA7wGdORFDQUgXs 1581411253592
__label__17 Thuốc TP-K8o4j9P353UojCeEerfGGLsq42ruAiN99QyYAzNZYu 1 TP-K8o4j9P353UojCeEerfGGLsq42ruAiN99QyYAzNZYu 1581411246118
__label__18 Thuốc và thực phẩm TP-ji9AoH4dmj7OSOIg4q7CB7FuadF4v6zZLmIJZcAMts 1 TP-ji9AoH4dmj7OSOIg4q7CB7FuadF4v6zZLmIJZcAMts 1581411248604
__label__19 Bệnh đái tháo đường TP-1572434590216 1 TP-1572434590216 1572434590216 Diabetes
__label__20 Tiêu hóa TP-wycybdK378eqc6C6RjAdDh3M5snIYiTJp8F3F55zNC 1 TP-wycybdK378eqc6C6RjAdDh3M5snIYiTJp8F3F55zNC 1581411258597
__label__21 Tim mạch TP-3rD5KzEW7O33go5ICVu5geJiiW57VVzKqND7lEKsQI 1 TP-3rD5KzEW7O33go5ICVu5geJiiW57VVzKqND7lEKsQI 1581411256103
__label__22 Ung thư TP-9IoHH62b2IN4UBPT2hdr8QeZ2aO5LQKZT8UY6SoYXx 1 TP-9IoHH62b2IN4UBPT2hdr8QeZ2aO5LQKZT8UY6SoYXx 1581411263626
__label__23 Y học cổ truyền TP-4m462ME26kY49fmBofqQAVltRgEsVDlZhQ3j4egnFV 1 TP-4m462ME26kY49fmBofqQAVltRgEsVDlZhQ3j4egnFV 1581411266152
import re
import os
from collections import defaultdict
from utils.read_data import load_jsonl_file
import pandas as pd
from html import unescape
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import fasttext
from .categories import category2index
# Directory layout of the classification pipeline:
# raw crawled jsonl per category -> cleaned CSVs -> fastText train/test files.
RAW_DIR = 'classification/classification_category_raw_data'
PROCESS_DIR = 'classification/classification_data'
FASTTEXT_DIR = 'classification/fasttext_data/categories_data'
def process(input_dir, output_dir):
    """Merge per-category jsonl crawl files, scrub source watermarks, and
    write one tab-separated CSV per category into *output_dir*.

    Input file names are expected to look like '<category>_<suffix>'; all
    files sharing a category prefix are concatenated.
    """
    # Source-site watermarks and stray punctuation to blank out of the text.
    noise_pattern = r'(\(VietQ.vn\)|Suckhoedoisong.vn|VOV.VN|VTV.vn|^Dân trí |[±\-+= ¯]+)'
    docs_by_category = defaultdict(list)
    for fn in os.listdir(input_dir):
        category = fn.rsplit(r'_', maxsplit=1)[0]
        docs_by_category[category].extend(load_jsonl_file(os.path.join(input_dir, fn)))
    for category, docs in docs_by_category.items():
        df = pd.DataFrame(
            data=docs,
            index=list(range(len(docs)))
        )
        # The same cleanup applies to all three text columns.
        for column in ('title', 'description', 'content'):
            df[column].replace(
                to_replace=noise_pattern,
                value=' ',
                regex=True,
                inplace=True
            )
        df.to_csv(os.path.join(output_dir, f'{category}.csv'), sep='\t', index=False)
def csv2fasttext_data(input_dir, output_dir, limit=5000):
    """Convert per-category CSVs into fastText train/test files.

    Each category is capped at *limit* rows, the three text fields are
    concatenated, HTML entities are unescaped, whitespace is collapsed and
    standalone numbers are masked as ' __NUM__ '. An 80/20 split is written
    to train.txt / test.txt in fastText '__label__<idx> text' format.
    """
    train_dfs = []
    test_dfs = []
    for fn in os.listdir(input_dir):
        category = fn.split('.')[0]
        df = pd.read_csv(os.path.join(input_dir, fn), sep='\t')
        if len(df) > limit:
            # Down-sample over-represented categories for class balance.
            df = df.sample(limit, random_state=42)
        df.fillna('', inplace=True)
        df['text'] = df['title'] + '. ' + df['description'] + '. ' + df['content']
        # Fixed: unescape() was called on the whole Series, which is a no-op
        # (html.unescape checks "'&' not in s", and `in` on a Series tests the
        # index). It must be applied per row to actually decode entities.
        df['text'] = df['text'].apply(unescape)
        df['text'].replace(to_replace=r'(\.+\s+|\s+)', value=' ', inplace=True, regex=True)
        df['text'].replace(to_replace=r' \d+ ', value=' __NUM__ ', inplace=True, regex=True)
        df['label'] = f'__label__{category2index[category]}'
        train_df, test_df = train_test_split(df[['label', 'text']], test_size=0.2, random_state=42)
        train_dfs.append(train_df)
        test_dfs.append(test_df)
    train = shuffle(pd.concat(train_dfs))
    test = shuffle(pd.concat(test_dfs))
    _write_fasttext_file(os.path.join(output_dir, 'train.txt'), train)
    _write_fasttext_file(os.path.join(output_dir, 'test.txt'), test)


def _write_fasttext_file(path, df):
    """Write one '__label__X text' line per row of *df* (fastText format)."""
    # Fixed: orient='record' is a deprecated abbreviation — use 'records';
    # the redundant f.close() inside the `with` block is gone too.
    with open(path, mode='w', encoding='utf8') as f:
        for doc in df.to_dict(orient='records'):
            f.write(f"{doc['label']} {doc['text']}\n")
def train_fasttext(
        train_fn,
        ws,
        epoch,
        loss,
        dim,
        wordNgrams,
        lr,
        minCount):
    """Train and return a supervised fastText classifier on *train_fn*
    with the given hyper-parameters (window size, epochs, loss, embedding
    dimension, n-gram order, learning rate, minimum token count).
    """
    return fasttext.train_supervised(
        train_fn,
        ws=ws,
        epoch=epoch,
        loss=loss,
        dim=dim,
        wordNgrams=wordNgrams,
        lr=lr,
        minCount=minCount
    )
def load_fasttext_model(model_fn):
    """Load and return a serialized fastText model from *model_fn*."""
    return fasttext.load_model(model_fn)
def test_fasttext_model(model, test_fn):
result = model.test(test_fn)
print(result)
return result
def save_model(model, model_fn):
    """Persist *model* to *model_fn* using fastText's native serializer."""
    model.save_model(model_fn)
# def train():
# ws = 25
# epoch = 250
# loss = 'softmax'
# dim = 150
# wordNgrams = 3
# lr = 0.2
# minCount = 3
#
#
\ No newline at end of file
[Unit]
# Fix: Requires= declares a dependency but does NOT order startup; the unit
# must also list network.target in After= (systemd.unit(5)). The API is also
# ordered after the crawler service it accompanies.
Requires=network.target
After=network.target crawler.service

[Service]
Type=simple
User=root
ExecStart=/datadrive/deepcare/crawler/start-crawler-api.sh
ExecStop=/datadrive/deepcare/crawler/stop-crawler-api.sh
# Restart only on unclean exit codes/signals/timeouts, not on clean stops.
Restart=on-abnormal

[Install]
WantedBy=multi-user.target
\ No newline at end of file
[Unit]
# Fix: Requires= declares a dependency but does NOT order startup; the unit
# must also list network.target in After= (systemd.unit(5)). The processors
# are additionally ordered after the crawler service.
Requires=network.target
After=network.target crawler.service

[Service]
Type=simple
User=root
ExecStart=/datadrive/deepcare/crawler/start-crawler-processors.sh
ExecStop=/datadrive/deepcare/crawler/stop-crawler-processors.sh
# Restart only on unclean exit codes/signals/timeouts, not on clean stops.
Restart=on-abnormal

[Install]
WantedBy=multi-user.target
\ No newline at end of file
[Unit]
# Fix: After= was left empty, so Requires=network.target created a dependency
# with no startup ordering — the crawler could start before networking is up.
# Requires= must be paired with After= (systemd.unit(5)).
Requires=network.target
After=network.target

[Service]
Type=simple
User=root
ExecStart=/datadrive/deepcare/crawler/start-crawler.sh
ExecStop=/datadrive/deepcare/crawler/stop-crawler.sh
# Restart only on unclean exit codes/signals/timeouts, not on clean stops.
Restart=on-abnormal

[Install]
WantedBy=multi-user.target
from scrapy import Item, Field
class NewsHeaderItem(Item):
    """Scrapy item holding one crawled article."""
    title = Field()         # article headline
    image = Field()         # thumbnail url
    summary = Field()       # short description
    create_at = Field()     # publication time string
    content = Field()       # cleaned article HTML
    status = Field()        # editorial status
    original_url = Field()  # canonical source url
    domain_id = Field()     # id of the crawl Domain record
from scrapy import Request, Spider
import re
from dateutil.parser import parse
from requests.utils import requote_uri
from langdetect import detect
from utils.date_utils import get_full_time
from utils.html_process import clean_html
from langdetect.lang_detect_exception import LangDetectException
import time
# Map langdetect ISO 639-1 codes to the project's language labels; anything
# not listed here falls back to 'OTHER' at the call site.
lang_detect2language = {
    'vi': 'VN',
    'en': 'EN',
    'fr': 'FR'
}
class NewsCrawler(Spider):
    def __init__(self, name=None, **kwargs):
        """Spider configured from a crawl Domain record.

        kwargs must contain ``domain``: an object exposing domain_name, the
        xpath columns (title/summary/time/tags/content/...), start_urls and
        the crawled_urls history.
        """
        super(NewsCrawler, self).__init__(name, **kwargs)
        self.metadata = kwargs
        self.domain = kwargs.get('domain')
        # Restrict scrapy to the configured domain.
        self.allowed_domains = [self.domain.domain_name]
        # Normalize the raw xpath columns once, up front (append '//text()' etc.).
        self.xpath = self.process_xpath_details(self.domain.__dict__)
        # Urls already fetched in earlier runs — used to deduplicate.
        self.crawled_urls = set(self.domain.crawled_urls)
def start_requests(self):
for start_url in self.domain.start_urls:
if start_url.using_next_button:
yield Request(url=start_url.start_url, callback=self.parse_using_next)
else:
url = re.sub(r'___NUM___', str(1), start_url.start_url)
yield Request(url=url, callback=self.parse_not_using_next, meta={'index': 1, 'start_url': start_url})
    def parse(self, response):
        """Default scrapy callback — unused; every request sets an explicit callback."""
        pass
    def parse_using_next(self, response):
        """Parse a listing page that paginates via a 'next' link.

        Schedules one detail request per (url, thumbnail) pair not yet
        crawled, then follows the next-page link while unseen articles are
        still being found.
        """
        next_page = response.xpath(self.domain.next + '/@href').get()
        # Normalize article urls: resolve relative, strip the query string, re-quote.
        news_urls = [requote_uri(re.sub(r'\?.+$', '', response.urljoin(news_url))) for news_url in
                     response.xpath(self.domain.url + '/@href').getall()]
        thumbnails = response.xpath(self.domain.thumbnail).getall()
        # The listing selectors must yield exactly one thumbnail per article link.
        assert len(news_urls) == len(thumbnails)
        # Keep only articles not crawled in previous runs.
        url_vs_thumbnail = [(news_url, thumbnail_image_src) for news_url, thumbnail_image_src
                            in zip(news_urls, thumbnails) if news_url not in self.crawled_urls]
        for news_url, thumbnail in url_vs_thumbnail:
            # Take the first whitespace-separated token of the thumbnail value
            # (presumably to handle srcset-style attributes — TODO confirm).
            thumbnail = response.urljoin(thumbnail.split()[0])
            yield Request(url=news_url, callback=self.parse_result, meta={'thumbnail': thumbnail})
        # Stop paginating once a page contains no unseen articles.
        if len(url_vs_thumbnail) > 0:
            if next_page is not None:
                next_page = response.urljoin(next_page)
                yield Request(url=next_page, callback=self.parse_using_next)
    def parse_not_using_next(self, response):
        """Parse a listing page paginated by a page number in the URL.

        The start URL contains a ___NUM___ placeholder; while a page still
        yields unseen articles, the next page index is requested.
        """
        current_page_index = response.meta['index']
        start_url = response.meta['start_url']
        news_urls = [response.urljoin(news_url) for news_url in
                     response.xpath(self.domain.url + '/@href').getall()]
        thumbnails = response.xpath(self.domain.thumbnail).getall()
        # The listing selectors must yield exactly one thumbnail per article link.
        assert len(news_urls) == len(thumbnails)
        # Keep only articles not crawled in previous runs.
        url_vs_thumbnail = [(news_url, thumbnail_image_src) for news_url, thumbnail_image_src
                            in zip(news_urls, thumbnails) if news_url not in self.crawled_urls]
        for news_url, thumbnail in url_vs_thumbnail:
            news_url = response.urljoin(news_url)
            # Take the first whitespace-separated token of the thumbnail value
            # (presumably to handle srcset-style attributes — TODO confirm).
            thumbnail = response.urljoin(thumbnail.split()[0])
            yield Request(url=news_url, callback=self.parse_result, meta={'thumbnail': thumbnail})
        # Stop paginating once a page contains no unseen articles.
        if len(url_vs_thumbnail) > 0:
            yield Request(url=re.sub(r'___NUM___', str(current_page_index + 1), start_url.start_url),
                          callback=self.parse_not_using_next,
                          meta={'index': current_page_index + 1, 'start_url': start_url})
def parse_result(self, response):
    """Article-page callback: extract fields and yield a single item dict.

    The item is yielded only when a non-empty content body was extracted.
    Title is mandatory (early return otherwise); summary, time, and tags
    degrade to defaults when their xpaths are not configured.
    """
    self.crawled_urls.add(response.request.url)
    thumbnail = response.meta['thumbnail']
    title = self.concat_strip_strings(response.xpath(self.xpath['title']).getall())
    if not title: return None
    summary = self.concat_strip_strings(response.xpath(self.xpath['summary']).getall()) if \
        self.xpath['summary'] else ""
    # Publication time: parse from the page when a time xpath is configured;
    # fall back to "now" (local time) both when unconfigured and when
    # get_full_time returns a falsy result.
    full_time = (get_full_time(self.concat_strip_strings(response.xpath(self.xpath['time']).getall()))
                 if self.xpath['time']
                 else time.strftime("%Y-%m-%d %H:%M:%S"))
    if not full_time: full_time = time.strftime("%Y-%m-%d %H:%M:%S")
    time_obj = parse(full_time) if full_time else None
    tags = [tag.strip() for tag in response.xpath(self.xpath['tags']).getall()] if \
        self.xpath['tags'] else []
    try:
        content = self.parse_content(response)
    except AttributeError as e:
        self.logger.error(f'{e}: {response.request.url}')
        content = None
    try:
        # Language detection needs a text sample; title is guaranteed non-empty
        # by the guard above, so the summary fallback is defensive only.
        language = lang_detect2language.get(detect(title if title else summary), 'OTHER')
    except LangDetectException as e:
        self.logger.error(f'{e}: {response.request.url}')
        language = "OTHER"
    if content:
        yield {
            'title': title,
            'image': thumbnail,
            'summary': summary,
            'create_at': full_time,  # "%Y-%m-%d %H:%M:%S" string
            'timestamp': time_obj.timestamp() if time_obj else time.time(),
            'content': content,
            'tags': ','.join(tags),
            'origin_url': response.request.url,
            'domain_id': self.domain.domain_id,
            'language': language
        }
def parse_content(self, response):
    """Extract the article body via the domain's content xpath and clean it.

    Returns the cleaned HTML, or None when the xpath matches nothing.
    """
    raw_html = response.xpath(self.domain.content).get()
    return clean_html(raw_html) if raw_html is not None else None
@staticmethod
def concat_strip_strings(l):
l = [s.strip() for s in l if s]
return ' '.join([s for s in l if s])
@staticmethod
def process_xpath_details(xpath_details):
result = dict()
result['title'] = xpath_details['title'] + '//text()'
if xpath_details['summary']:
result['summary'] = xpath_details['summary'] + '//text()' \
if not xpath_details['summary'].endswith('text()') else xpath_details['summary']
else:
result['summary'] = ''
result['time'] = xpath_details['time'] + '//text()' if not xpath_details['time'].startswith('substring') \
else xpath_details['time']
result['content'] = xpath_details['content']
result['tags'] = xpath_details['tags'] + "/text()" if xpath_details['tags'] else ''
return result
class CrawlerWrapper:
    """Factory helpers that specialize spider classes per output feed file."""

    @staticmethod
    def news_crawler_object(super_crawler_class, fn):
        """Return a subclass of *super_crawler_class* writing its feed to data/<fn>.jsonl."""
        feed_settings = {'FEED_URI': f'data/{fn}.jsonl'}

        class Crawler(super_crawler_class):
            custom_settings = feed_settings

        return Crawler
from kafka_module.kafka_producer import Producer
import json
class KafkaItemPipeline:
    """Scrapy item pipeline that publishes scraped dict items to a Kafka topic."""

    def __init__(self, bootstrap_servers, topic):
        self.bootstrap_servers = bootstrap_servers
        self.topic = topic
        self.producer = None  # created lazily in open_spider

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook: read broker address and topic from crawler settings."""
        return cls(
            bootstrap_servers=crawler.settings.get('BOOTSTRAP_SERVERS'),
            topic=crawler.settings.get('TOPIC')
        )

    def open_spider(self, spider):
        self.producer = Producer(self.bootstrap_servers)

    def close_spider(self, spider):
        if self.producer:
            self.producer.close()

    def process_item(self, item, spider):
        """Publish dict items as UTF-8 JSON; always pass the item on.

        Fixes: previously returned None, handing None to any later pipeline
        (Scrapy pipelines must return the item or raise DropItem); the dict
        check now uses isinstance instead of a type() equality comparison.
        """
        if isinstance(item, dict):
            item_bytes = json.dumps(item, ensure_ascii=False).encode('utf-8')
            self.producer.send_message(self.topic, item_bytes)
        return item
from sqlalchemy.ext.automap import automap_base
from sqlalchemy import Column, String, Boolean, ForeignKey
from sqlalchemy.engine import create_engine
# Declarative base from SQLAlchemy's automap extension; connect() below
# reflects the remaining tables of the live database into it.
Base = automap_base()
class NewsHeader(Base):
    """Explicit mapping for the `news_header` table (one crawled article per row)."""
    __tablename__ = 'news_header'
    # UUID4 string generated by the pipeline at insert time
    news_header_id = Column(String, nullable=False, primary_key=True)
    title = Column(String, nullable=False)
    # absolute thumbnail URL
    image = Column(String, nullable=False)
    summary = Column(String, nullable=False)
    # publication time as a "%Y-%m-%d %H:%M:%S" string
    create_at = Column(String, nullable=False)
    # cleaned article HTML
    content = Column(String, nullable=False)
    # editorial workflow state; rows are inserted as 'not_edited'
    status = Column(String, nullable=False, default='not_edited')
    origin_url = Column(String, nullable=False)
    domain_id = Column(String, ForeignKey('domain.domain_id'))
    # soft-delete flag
    deleted = Column(Boolean, nullable=False, default=False)
def connect(username, password, db, host='localhost', port=3306):
    """Create a MySQL engine and reflect the database schema into Base.

    Fix: username and password are URL-escaped before interpolation —
    credentials containing URL-reserved characters (@, /, :, %) previously
    corrupted the connection URL.

    Returns the SQLAlchemy engine; reflected classes are attached to Base.
    """
    from urllib.parse import quote_plus
    url = f"mysql://{quote_plus(username)}:{quote_plus(password)}@{host}:{port}/{db}"
    engine = create_engine(url)
    Base.prepare(engine, reflect=True)
    return engine
from sqlalchemy.orm import sessionmaker
from uuid import uuid4
from ..items.news_header import NewsHeaderItem
from .models import connect, NewsHeader
class MySQLPipeline(object):
    """Scrapy pipeline that persists NewsHeaderItem objects into MySQL.

    A fresh session is opened per item and always closed, so a failed insert
    cannot poison later items.
    """

    def __init__(self, username, password, host, port, db):
        """Initialize the database connection and session factory."""
        engine = connect(
            username=username,
            password=password,
            host=host,
            port=port,
            db=db
        )
        self.Session = sessionmaker(bind=engine)

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook: pull MySQL credentials from crawler settings."""
        return cls(
            username=crawler.settings.get('MYSQL_USERNAME'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            host=crawler.settings.get('MYSQL_HOST'),
            port=crawler.settings.get('MYSQL_PORT'),
            db=crawler.settings.get('MYSQL_DB')
        )

    def process_item(self, item: NewsHeaderItem, spider):
        """Insert the item as a new NewsHeader row; always return the item.

        Fixes: the bare ``except:`` (which also swallows KeyboardInterrupt
        and SystemExit) is narrowed to ``except Exception:``, and the type
        check uses isinstance instead of ``type(...) is ...``.
        """
        session = self.Session()
        try:
            if isinstance(item, NewsHeaderItem):
                session.add(NewsHeader(
                    news_header_id=str(uuid4()),
                    title=item.title,
                    image=item.image,
                    summary=item.summary,
                    create_at=item.create_at,
                    content=item.content,
                    status='not_edited',
                    origin_url=item.origin_url,
                    domain_id=item.domain_id
                ))
                session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()
        return item
import requests
import pandas as pd
from treelib import Node, Tree
from uuid import uuid4
import time
# CMS authentication token (JWT) used for the addTopic requests below.
# NOTE(review): this secret is hard-coded in source control — it should be
# moved to configuration/environment and the committed token revoked.
key = {
    "token":"eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE1ODIwMTQ3OTQwMDB9.SlpAo6M4a2d9cQqFS8Mcrvo0OBDe5AhS0GiH9px-hY8"
}
def get_branch(node):
    """Return the level-1 ancestor (top branch) of *node* in the module-level tree.

    Returns None for the root and for level-1 nodes, which are their own branch.
    """
    depth = tree.level(node)
    if depth <= 1:
        return None
    # walk up depth-1 parents to land on the level-1 ancestor
    for _ in range(depth - 1):
        node = tree.parent(node).identifier
    return tree[node]
# --- One-off migration script ---
# Reads the category tree from CSV, posts each category to the CMS `addTopic`
# endpoint, rewrites local ids with the server-assigned topic ids, and dumps
# the resulting mapping to test_categories.csv.
categories_df = pd.read_csv('init_data/new_categories.csv', sep=',')
categories_df.fillna('', inplace=True)
tree = Tree()
tree.create_node(tag='Root', identifier='root')
for category in categories_df.itertuples():
    # rows without a parent_id become children of the synthetic 'root' node
    tree.create_node(
        tag=category.category_name,
        identifier=category.category_id,
        parent=category.parent_id if category.parent_id else 'root',
        data={
            'category_id': category.category_id,
            'category_name': category.category_name,
            'parent_id': category.parent_id if category.parent_id else None
        }
    )
tree.show()
# Depth-first traversal so a parent is posted (and receives its server id)
# before any of its children are posted.
for node in tree.expand_tree(mode=tree.DEPTH):
    data = tree[node].data
    if data:
        branch = get_branch(node)
        res = requests.post(
            url='http://dev.deepcare.io:8181/api/v1/cms/news/addTopic',
            json={
                'topic_name': data['category_name'],
                'parent': data['parent_id'],
                'level': tree.level(node),
                'branch': branch.data['category_id'] if branch else None
            },
            headers={
                'content-type': 'application/json',
                'x-access-token': key['token']
            }
        )
        # NOTE(review): res.json() is called twice; the second call re-parses
        # the same response body.
        print(res.json())
        res_data = res.json()
        print(data['category_name'], res_data.get('status'))
        if res_data.get('status') == 'OK':
            # replace the local id with the server-assigned one and repoint
            # the children before they are visited
            topic_id = res_data.get('topic_id')
            data['category_id'] = topic_id
            for child in tree.children(node):
                child.data['parent_id'] = topic_id
        time.sleep(2)  # throttle requests against the dev CMS
        # Offline fallback kept for reference: generate ids locally instead
        # of calling the API.
        # new_id = str(uuid4())
        # data['category_id'] = new_id
        # for child in tree.children(node):
        #     child.data['parent_id'] = new_id
        #
        # print({
        #     'topic_name': data['category_name'],
        #     'parent': data['parent_id'],
        #     'level': tree.level(node),
        #     'branch': branch.data['category_id'] if branch else None
        # })
# Flatten the (now re-identified) tree back into a CSV for later seeding.
categories = []
for node in tree.expand_tree():
    category = tree[node].data
    if category:
        categories.append(category)
df = pd.DataFrame(categories)
df.to_csv('test_categories.csv', sep=',', index=False)
category_id,category_name,deleted,parent_id
0,"An toàn thực phẩm",0,NULL
1,"Cơ xương khớp",0,NULL
10,Nhi,0,NULL
11,"Nữ giới",0,NULL
12,Răng,0,NULL
13,"Sản phụ khoa",0,NULL
14,"Tai mũi họng",0,NULL
15,"Tâm thần",0,NULL
16,"Thần kinh",0,NULL
17,"Thận niệu",0,NULL
18,Thuốc,0,NULL
19,"Thuốc và thực phẩm",0,NULL
2,"Da liễu",0,NULL
20,"Tiểu đường",0,NULL
21,"Tiêu hóa",0,NULL
22,"Tim mạch",0,NULL
23,"Ung thư",0,NULL
24,"Y học cổ truyền",0,NULL
25,"Đại cương, phân loại và cơ chế bệnh sinh",0,20
26,"Triệu chứng lâm sàng, theo dõi bệnh",0,20
27,"Biến chứng bệnh",0,20
28,"Điều trị và dự phòng",0,20
29,"Ăn uống và tập luyện ",0,20
3,"Dinh dưỡng",0,NULL
4,"Hô hấp",0,NULL
5,"Huyết học",0,NULL
6,"Khám lâm sàng",0,NULL
7,"Khỏe đẹp",0,NULL
8,Mắt,0,NULL
9,"Nam giới",0,NULL
topic_id,topic_name,parent,level,branch,create_at,topic_en,topic_fr
TP-1572434590216,Bệnh đái tháo đường,,1,TP-1572434590216,1572434590216,Diabetes,
TP-1572434716247,"Đại cương, phân loại và cơ chế bệnh sinh",TP-1572434590216,2,TP-1572434590216,1572434716247,"Outline, classification and pathogenesis mechanism",
TP-1572434723922,"Triệu chứng lâm sàng, theo dõi bệnh",TP-1572434590216,2,TP-1572434590216,1572434723922,,
TP-1572434732196,Biến chứng bệnh,TP-1572434590216,2,TP-1572434590216,1572434732196,Diabetes complications,
TP-1572434739627,Điều trị và dự phòng,TP-1572434590216,2,TP-1572434590216,1572434739627,,
TP-1572434745211,Ăn uống và tập luyện ,TP-1572434590216,2,TP-1572434590216,1572434745211,Eating and exercising,
TP-3rD5KzEW7O33go5ICVu5geJiiW57VVzKqND7lEKsQI,Tim mạch,,1,TP-3rD5KzEW7O33go5ICVu5geJiiW57VVzKqND7lEKsQI,1581411256103,"",""
TP-4m462ME26kY49fmBofqQAVltRgEsVDlZhQ3j4egnFV,Y học cổ truyền,,1,TP-4m462ME26kY49fmBofqQAVltRgEsVDlZhQ3j4egnFV,1581411266152,"",""
TP-5tgE8zu7FA3KP06rIJhewaoksFLmPyQEKpDAvgQe4h,Mắt,,1,TP-5tgE8zu7FA3KP06rIJhewaoksFLmPyQEKpDAvgQe4h,1581411228227,"",""
TP-6sBTBvZIvz7hT7rRDMA6CMl4EMgtA7wGdORFDQUgXs,Thận niệu,,1,TP-6sBTBvZIvz7hT7rRDMA6CMl4EMgtA7wGdORFDQUgXs,1581411253592,"",""
TP-8L84oJAHOO4PeyUp4yv2wqI4h71yH1Im4z4Mu0xcIE,Tai mũi họng,,1,TP-8L84oJAHOO4PeyUp4yv2wqI4h71yH1Im4z4Mu0xcIE,1581411243608,"",""
TP-9IoHH62b2IN4UBPT2hdr8QeZ2aO5LQKZT8UY6SoYXx,Ung thư,,1,TP-9IoHH62b2IN4UBPT2hdr8QeZ2aO5LQKZT8UY6SoYXx,1581411263626,"",""
TP-B6QL1djFPAHGNItc6NubQ6TWONg2oFs9Q3JV4WTvuw,Cơ xương khớp,,1,TP-B6QL1djFPAHGNItc6NubQ6TWONg2oFs9Q3JV4WTvuw,1581411212602,"",""
TP-bl91nus82gwEOc91DHN9HeytXdq2cyFeTbc9FILz00,Sản phụ khoa,,1,TP-bl91nus82gwEOc91DHN9HeytXdq2cyFeTbc9FILz00,1581411241119,"",""
TP-dBd2NKk9zF1p09lT38ho90YJsqW2b94qf5Y1PN8b9V,An toàn thực phẩm,,1,TP-dBd2NKk9zF1p09lT38ho90YJsqW2b94qf5Y1PN8b9V,1581411210111,"",""
TP-dLoUSEk810sOPzTW5t16iNXUzFeNBONSIsg71vZnc9,Thần kinh,,1,TP-dLoUSEk810sOPzTW5t16iNXUzFeNBONSIsg71vZnc9,1581411251096,"",""
TP-dPmaJkUsBTaFf5q09mo36RXU1A57BqCjT6a90luYMG,Nữ giới,,1,TP-dPmaJkUsBTaFf5q09mo36RXU1A57BqCjT6a90luYMG,1581411235910,"",""
TP-GKpm0S6864VMD8MEE08eJ7qsXxoYgo6sC6FQ1HU5DU,Nam giới,,1,TP-GKpm0S6864VMD8MEE08eJ7qsXxoYgo6sC6FQ1HU5DU,1581411230724,"",""
TP-H02I4fQ4WMm3s3BFlZ2Ccni5NmNs1145kVZCWItD3P,Da liễu,,1,TP-H02I4fQ4WMm3s3BFlZ2Ccni5NmNs1145kVZCWItD3P,1581411215459,"",""
TP-hi0l84cdNdT0pmFrehXkX2VvrurNww4zm1Y2S2LxCg,Dinh dưỡng,,1,TP-hi0l84cdNdT0pmFrehXkX2VvrurNww4zm1Y2S2LxCg,1581411217950,"",""
TP-HwqOVVkvIt18oLJVWk1v683zlarQJwPBLz0HHuHd8C,Huyết học,,1,TP-HwqOVVkvIt18oLJVWk1v683zlarQJwPBLz0HHuHd8C,1581411220505,"",""
TP-izkL313lDwnf7LPUcyd7Lv5nBaWbG6sZAIvwVDAXR2,Răng,,1,TP-izkL313lDwnf7LPUcyd7Lv5nBaWbG6sZAIvwVDAXR2,1581411238401,"",""
TP-ji9AoH4dmj7OSOIg4q7CB7FuadF4v6zZLmIJZcAMts,Thuốc và thực phẩm,,1,TP-ji9AoH4dmj7OSOIg4q7CB7FuadF4v6zZLmIJZcAMts,1581411248604,"",""
TP-K8o4j9P353UojCeEerfGGLsq42ruAiN99QyYAzNZYu,Thuốc,,1,TP-K8o4j9P353UojCeEerfGGLsq42ruAiN99QyYAzNZYu,1581411246118,"",""
TP-MGb5V1n0l0O9nr0jKJ0150OdaFz71lFwdxj0lT0iZh,Nhi,,1,TP-MGb5V1n0l0O9nr0jKJ0150OdaFz71lFwdxj0lT0iZh,1581411233213,"",""
TP-MgW7UZrbbTcI2CjU4V7GCxdHO8nUv5H9ZNWI2BObGu,Tâm thần,,1,TP-MgW7UZrbbTcI2CjU4V7GCxdHO8nUv5H9ZNWI2BObGu,1581411261128,"",""
TP-riAxk9FBOhmth99dImrgNoD9r9zuGiELBTOKvIXlMd,Hô hấp,,1,TP-riAxk9FBOhmth99dImrgNoD9r9zuGiELBTOKvIXlMd,1581411223010,"",""
TP-vEXWfsB8BA4AK6Y7at3YsTjoZaZ6JDC635bBYhtDbC,Khỏe đẹp,,1,TP-vEXWfsB8BA4AK6Y7at3YsTjoZaZ6JDC635bBYhtDbC,1581411225731,"",""
TP-wycybdK378eqc6C6RjAdDh3M5snIYiTJp8F3F55zNC,Tiêu hóa,,1,TP-wycybdK378eqc6C6RjAdDh3M5snIYiTJp8F3F55zNC,1581411258597,"",""
This diff is collapsed.
This diff is collapsed.
category_id,category_name,deleted,parent_id
0,An toàn thực phẩm,0,NULL
1,Cơ xương khớp,0,NULL
2,Da liễu,0,NULL
3,Dinh dưỡng,0,NULL
4,Hô hấp,0,NULL
5,Huyết học,0,NULL
6,Khỏe đẹp,0,NULL
7,Mắt,0,NULL
8,Nam giới,0,NULL
9,Nhi,0,NULL
10,Nữ giới,0,NULL
11,Răng,0,NULL
12,Sản phụ khoa,0,NULL
13,Tai mũi họng,0,NULL
14,Tâm thần,0,NULL
15,Thần kinh,0,NULL
16,Thận niệu,0,NULL
17,Thuốc,0,NULL
18,Thuốc và thực phẩm,0,NULL
20,Tiêu hóa,0,NULL
21,Tim mạch,0,NULL
22,Ung thư,0,NULL
23,Y học cổ truyền,0,NULL
start_url_id,using_next_button,max_no_pages,start_url,deleted,domain_id
1eb174d5-2295-4f1d-a1c6-6baca20435d3,1,NULL,https://thoidai.com.vn/khoe-dep,0,f1f0f18b-2b5b-4b4e-8ab5-b9b47dcf8d68
1effa4e5-b469-4d5a-8d6d-6240e31de260,1,NULL,https://www.doisongphapluat.com/doi-song/suc-khoe-lam-dep/,0,3ff553db-beb8-4f49-82a7-363e287d5ac1
2a616e01-a7e0-4d7f-8865-8d65e191712a,1,NULL,https://vtv.vn/suc-khoe/benh-hiem-ngheo.htm,0,80f2ced5-c09a-4133-9bcd-d3233a46cabc
2a6974b7-66f9-4b75-898c-6a0253e1e178,0,29,https://viettimes.vn/api/morenews-zonepage-120-___NUM___.html,0,5d55a074-1844-486f-9974-7944b08b3e4d
3486d9a8-4b02-43b3-a2a6-cb12b61a85bf,1,NULL,https://kienthuc.net.vn/khoe-dep/,0,9a6d2e58-f13a-4d95-8fb2-007086bee893
35a113da-8ffe-4775-9ba3-8e6ee11cdb14,1,NULL,https://phunuvietnam.vn/ky-nang/khoe.html,0,c15fc089-6387-41a5-b972-9546cf0ebe47
3ae04cfb-2ef5-4ae4-b55d-5852e095b96e,0,50,https://news.zing.vn/suc-khoe/trang___NUM___.html,0,ea03354e-eb11-4c53-b802-a7c2efa95a52
3b7720f7-ebb9-4608-a16a-5e818ccc8cec,1,NULL,https://baotintuc.vn/suc-khoe-564ct0.htm,0,4829d56c-17c0-4dd3-b12c-95359a402f9e
3c5e691f-52b8-45cb-b72d-5e3448a6a4e9,0,NULL,https://vietnamnet.vn/vn/suc-khoe/trang___NUM___/,0,c91d565a-7cdb-4879-bbec-7208053a9053
47d476bb-c4b3-4291-929e-79405aa9ba7f,1,NULL,http://baochinhphu.vn/Suc-khoe/450.vgp,0,753dbe23-45a7-4dcf-bef1-96b229ff5ac8
4c2da900-a72d-48ae-9d2d-11e3eb0eb009,1,NULL,https://www.baogiaothong.vn/y-te/,0,fdc0f3fc-ec66-4183-8656-d94bb20ad763
50159ff6-f120-4e8a-9cc0-72506c963568,1,NULL,https://vtv.vn/suc-khoe/song-khoe.htm,0,80f2ced5-c09a-4133-9bcd-d3233a46cabc
51c85639-ec76-466b-809c-0129e4be30e1,1,NULL,https://kienthuctieuduong.vn/tin-tuc/?PageSpeed=noscript,0,5dc665e1-44bf-480e-bbf2-ce599418b604
56be19a5-629a-4b70-9a86-7120af7703a4,1,NULL,https://vov.vn/suc-khoe/,0,c4aeb921-fdc3-481c-a9a6-428628d24ba2
73830a8a-2579-4a22-9106-d0ab1ed61287,0,NULL,https://laodong.vn/suc-khoe?page=___NUM___,0,d15b1b98-149b-466b-b5f6-abe5246e091c
7ab915b6-43b6-40b2-be29-3d16893854f4,1,NULL,https://www.tienphong.vn/suc-khoe/,0,e248f4f2-0e87-4f1c-b289-f02259d33377
87421472-a819-420a-9498-6bfe23d4d804,1,NULL,https://infonet.vietnamnet.vn/suc-khoe-1190.info,0,730d3915-543e-451f-8480-f3d60702f2df
8ac0c0fc-6bb7-422a-9899-c92daad1c42f,1,NULL,https://thanhnien.vn/suc-khoe/,0,14c39a0e-11f8-4710-bd8c-14d768b409df
8c10545b-afaf-4618-8efd-4d857dfb0c70,1,NULL,https://dantri.com.vn/suc-khoe.htm,0,8d7f02f0-0593-4696-a777-d1acc43a12e9
90eaaef5-6a47-4344-b1aa-b8d9c58d7c37,0,50,https://nld.com.vn/loadmorecategory-1050-___NUM___.htm,0,5e12202c-e4ba-4b4c-a0ae-da299c1af317
9a6553da-3be3-443c-9e73-efc83033f856,1,NULL,https://vtv.vn/song-khoe/duoc-lieu-dan-gian.htm,0,80f2ced5-c09a-4133-9bcd-d3233a46cabc
a2b240d1-4f46-404a-ae51-70ccef1c97fa,1,NULL,https://www.qdnd.vn/xa-hoi/y-te,0,e19346f3-2ea2-4eca-9b44-2f8643701044
ac7086df-e585-4128-95a0-fdf5b082ada5,1,NULL,https://vnexpress.net/suc-khoe,0,927ee234-5f91-4ca7-bb48-e6b4d8fd0303
b172822a-c103-4493-b822-36d77c94e567,0,NULL,https://tuoitre.vn/timeline/12/trang-___NUM___.htm,0,194d6fec-782e-4df0-a02d-e811a4cd2208
bcb4fd00-73a8-43dd-a0a1-811b2b1fbe06,1,NULL,https://vtv.vn/suc-khoe/tieu-diem.htm,0,80f2ced5-c09a-4133-9bcd-d3233a46cabc
bcf87ac9-46cb-41de-a50b-368e3b2aee9a,1,NULL,https://www.nhandan.com.vn/y-te,0,2722ed98-6ca9-4f2c-94a4-00785be521b9
c1f25a5e-8dbe-42a7-808a-70ddf3d9d208,1,NULL,https://ngaydautien.vn/dai-thao-duong/tin-tuc,0,3fdb6e60-ff72-48c8-a5ac-4e2d179bea86
c3345bff-5aea-439e-986e-905d0ecea1a5,1,NULL,https://www.dkn.tv/cat/suc-khoe,0,e98964a8-8610-4b9b-b3fe-f09861a4e867
cb89a470-bbd2-4675-85f4-dc4874fa70bb,1,NULL,https://vtv.vn/suc-khoe/y-hoc-the-gioi.htm,0,80f2ced5-c09a-4133-9bcd-d3233a46cabc
d631c7e4-eeb8-42a9-8d6e-c261de4cb93c,1,NULL,https://plo.vn/suc-khoe/,0,af268eab-f83c-4b8a-982a-a6383c783837
d79cfe1f-0e55-4528-9a10-05eff4c65664,1,NULL,https://www.24h.com.vn/suc-khoe-doi-song-c62.html,0,fc8ae873-c1be-49f3-b9a1-10ee06b94fa7
f444e4e9-f504-4a82-9560-df7366960d00,1,NULL,https://doisongvietnam.vn/suc-khoe.html,0,ecbaa769-6a7b-42a4-8fc0-c317d59718f0
from uuid import uuid4
from sqlalchemy.orm import Session as AlSession
import pandas as pd
import numpy as np
from treelib import Node, Tree
from mysql.engine import connect
from mysql.entities import *
from utils.read_data import load_jsonl_file
from config import mysql_settings
# Session factory bound to the MySQL engine configured in config/mysql_settings.
Session = connect(
    host=mysql_settings.HOST,
    port=mysql_settings.PORT,
    username=mysql_settings.USERNAME,
    password=mysql_settings.PASSWORD,
    db=mysql_settings.DATABASE
)
# Seed-data inputs (paths relative to the repository root).
DOMAIN_FN = 'init_data/domains_11012020.json'
CATEGORY_FN = 'init_data/categories.csv'
def add_domain(domain_fn, start_url_fn):
    """Seed the `domain` and `start_url` tables from two CSV files.

    Fix: missing-value detection now uses pd.isna() — the previous
    ``value is not np.nan`` compared object identity and silently fails for
    NaN floats that are not the np.nan singleton (as produced by read_csv),
    letting NaN leak into the database.
    """
    session: AlSession = Session()
    domain_df = pd.read_csv(domain_fn, sep=',')
    start_url_df = pd.read_csv(start_url_fn, sep=',')
    # a 0 page cap is translated to None (no cap) when building StartUrl rows
    start_url_df.max_no_pages.fillna(0, inplace=True)
    domains = [
        Domain(
            domain_id=domain.domain_id,
            domain_name=domain.domain_name,
            next=None if pd.isna(domain.next) else domain.next,
            url=domain.url,
            thumbnail=domain.thumbnail,
            title=domain.title,
            summary='' if pd.isna(domain.summary) else domain.summary,
            content=domain.content,
            tags=None if pd.isna(domain.tags) else domain.tags,
            time=None if pd.isna(domain.time) else domain.time,
        )
        for domain in domain_df.itertuples()
    ]
    start_urls = [
        StartUrl(
            start_url_id=start_url.start_url_id,
            using_next_button=start_url.using_next_button,
            max_no_pages=start_url.max_no_pages if start_url.max_no_pages > 0 else None,
            start_url=start_url.start_url,
            domain_id=start_url.domain_id
        )
        for start_url in start_url_df.itertuples()
    ]
    session.add_all(domains)
    session.add_all(start_urls)
    session.commit()
def add_categories(input_fn):
    """Seed the category table from the topics CSV, preserving tree order."""
    session: AlSession = Session()
    frame = pd.read_csv(input_fn, sep=',')
    frame.fillna('', inplace=True)
    topic_tree = Tree()
    topic_tree.create_node(tag='Root', identifier='root')
    for row in frame.itertuples():
        # topics without a parent hang off the synthetic 'root' node
        topic_tree.create_node(
            tag=row.topic_name,
            identifier=row.topic_id,
            parent=row.parent if row.parent else 'root',
            data=Category(
                category_id=row.topic_id,
                category_name=row.topic_name,
                parent_id=row.parent if row.parent else None
            )
        )
    # tree.show()
    categories = [topic_tree[node].data for node in topic_tree.expand_tree()
                  if topic_tree[node].data]
    session.add_all(categories)
    session.commit()
import snappy
import json
from .kafka_consumer import Consumer
def consume(topic, bootstrap_servers, callback):
    """Consume snappy-compressed JSON messages from *topic* indefinitely.

    Each message is decompressed, JSON-decoded, and handed to *callback* as a
    list (single objects are wrapped in a one-element list). Payloads that
    decode to anything other than a list or dict are silently skipped,
    matching the original behavior.

    Fix: type checks use isinstance instead of ``type(...) ==`` equality.
    """
    consumer = Consumer(topic=topic, bootstrap_servers=bootstrap_servers)
    for message in consumer.get_consumer():
        str_message = snappy.decompress(message.value).decode('utf-8')
        data = json.loads(str_message)
        if isinstance(data, list):
            callback(data)
        elif isinstance(data, dict):
            callback([data])
from kafka import KafkaAdminClient
from kafka.admin import NewTopic
import time
class KafkaAdmin:
    """Thin wrapper around KafkaAdminClient for topic management."""

    def __init__(self, bootstrap_servers):
        self.connection = KafkaAdminClient(
            bootstrap_servers=bootstrap_servers
        )

    def close(self):
        """Release the underlying admin client connection."""
        self.connection.close()

    def create_topic(self, name, num_partitions=3, replication_factor=1):
        """Create topic *name*; prints and returns the broker response.

        Fix: the broker response is now returned (previously only printed).
        """
        new_topic = NewTopic(
            name=name,
            num_partitions=num_partitions,
            replication_factor=replication_factor
        )
        res = self.connection.create_topics([new_topic])
        print(res)
        return res

    def delete_topic(self, topic_name, wait_seconds=120):
        """Delete *topic_name* and pause while the broker propagates the deletion.

        Fix: the previously hard-coded 120 s pause is parameterized as
        *wait_seconds* (same default, so existing callers are unaffected),
        and the broker response is returned.
        """
        res = self.connection.delete_topics([topic_name])
        time.sleep(wait_seconds)
        print(res)
        return res
from kafka import KafkaConsumer
class Consumer:
def __init__(self,
topic,
bootstrap_servers=None,
auto_offset_reset='earliest',
enable_auto_commit=True,
group_id=None):
self.consumer = KafkaConsumer(
topic,
bootstrap_servers=bootstrap_servers if bootstrap_servers else ['localhost:9092'],
auto_offset_reset=auto_offset_reset,
enable_auto_commit=enable_auto_commit,
group_id=group_id,
)
def get_consumer(self):
return self.consumer
from kafka import KafkaProducer
import snappy
class Producer:
def __init__(self, bootstrap_servers):
self.producer = KafkaProducer(bootstrap_servers=bootstrap_servers, api_version=(0, 10))
def send_message(self, topic, value=None, key=None, headers=None, partition=None, timestamp_ms=None):
self.producer.send(topic,
value=snappy.compress(value),
key=key, headers=headers,
partition=partition,
timestamp_ms=timestamp_ms)
def close(self):
self.producer.close()
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment