-
Notifications
You must be signed in to change notification settings - Fork 0
/
jieba_test.py
58 lines (45 loc) · 1.98 KB
/
jieba_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Question bank: hard-coded (question, answer) pairs used for similarity lookup.
# NOTE(review): most answers are placeholder text ("如果请按照一下步骤操作...") —
# presumably to be replaced with real step-by-step instructions; confirm with owner.
question_db = [
    ("我的瀚华工号是多少?", "你的瀚华工号是27897"),
    ("大x系统怎么给债务人设置债务减免?", "如果请按照一下步骤操作..."),
    ("大x系统怎么给债务人设置债务分期?", "如果请按照一下步骤操作..."),
    ("大x系统怎么登录?", "如果请按照一下步骤操作..."),
    ("瀚华系统怎么登录?", "如果请按照一下步骤操作..."),
    # Add more (question, answer) pairs here.
]
def chinese_tokenizer(text):
    """Segment Chinese *text* into a list of word tokens via jieba."""
    # jieba.lcut(text) is exactly list(jieba.cut(text)) per the jieba API.
    return list(jieba.cut(text))
# TF-IDF vectorizer wired to the jieba tokenizer: the default sklearn
# tokenizer splits on whitespace/word boundaries, which does not segment
# Chinese text correctly.
vectorizer = TfidfVectorizer(tokenizer=chinese_tokenizer)
# Fit the vocabulary and IDF weights on the bank's question texts only;
# answers are never vectorized.
texts = [q[0] for q in question_db]
# Train the TF-IDF model (module-level, runs once at import time).
vectorizer.fit(texts)
def getAnswers(user_question, threshold=0.6):
    """Print (and return) the best-matching canned answer for *user_question*.

    The user question is TF-IDF-vectorized and compared against every entry
    of ``question_db`` by cosine similarity. A match is accepted only when
    the best similarity strictly exceeds *threshold* (default 0.6, the
    original hard-coded cutoff, kept for backward compatibility).

    Returns the matched answer string, or ``None`` when no entry clears the
    threshold. (The original returned ``None`` unconditionally; existing
    callers ignore the return value, so this is backward compatible.)
    """
    user_question_vec = vectorizer.transform([user_question])
    print(f"\033[94m用户问题: {user_question}\033[0m")  # blue
    best_match = None
    highest_similarity = 0.0
    if question_db:
        # Batch-transform the whole bank and score it with ONE
        # cosine_similarity call, instead of one transform + one
        # cosine_similarity per entry inside a Python loop (the original
        # re-vectorized every bank question on every query).
        bank_vecs = vectorizer.transform([q for q, _ in question_db])
        similarities = cosine_similarity(user_question_vec, bank_vecs)[0]
        # argmax picks the first maximum, matching the original's
        # first-strictly-greater tie-breaking.
        best_idx = int(similarities.argmax())
        highest_similarity = float(similarities[best_idx])
        best_match = question_db[best_idx]
    if best_match and highest_similarity > threshold:
        # green
        print(f"\033[92m当前最匹配的问题: {best_match[0]}\n答案: {best_match[1]}\033[0m")
        return best_match[1]
    print("\033[91m没有找到相关问题答案。\033[0m")  # red
    return None
# Demo queries exercising fuzzy matching against the question bank.
for demo_question in (
    '登录瀚华系统',
    '我的工号是多少?',
    '登录大x系统',
    '怎么给债务人设置分期',
    '怎么给债务人设置减免',
):
    getAnswers(demo_question)