How to Use the Downloaded LDA Model Files for Topic Analysis
BOW Method (Bag-of-Words Model)

Files:
- lda_model_{index}.model: the LDA topic model
- dictionary_{index}.dict: the dictionary file
1. Install dependencies
pip install gensim jieba
2. Load the model and dictionary
from gensim.models import LdaModel
from gensim import corpora

# Load the trained LDA model and its dictionary
lda_model = LdaModel.load('lda_model_0.model')
dictionary = corpora.Dictionary.load('dictionary_0.dict')
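After loading, a quick sanity check helps confirm the two files belong together; a minimal sketch (num_topics and len() are standard gensim APIs):

# Sanity check: the model and dictionary should come from the same training run
print(f"Number of topics: {lda_model.num_topics}")
print(f"Vocabulary size: {len(dictionary)}")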
3. View the topics
# Display the top 10 words of every topic
for topic_id in range(lda_model.num_topics):
    topic = lda_model.show_topic(topic_id, topn=10)
    print(f"Topic {topic_id + 1}:")
    for word, prob in topic:
        print(f"  {word}: {prob:.3f}")
4. Analyze a new document
import jieba

# Preprocess the new document: segment it and drop single-character tokens
def preprocess_text(text):
    words = list(jieba.cut(text))
    return [word for word in words if len(word) > 1]

# Infer the topic distribution of the new document
new_text = "您的新文档内容"  # placeholder: your new document text
new_doc = preprocess_text(new_text)
new_bow = dictionary.doc2bow(new_doc)
doc_topics = lda_model.get_document_topics(new_bow)
for topic_id, prob in doc_topics:
    print(f"Topic {topic_id + 1}: {prob:.3f}")
Word2Vec Method

Files:
- word2vec_model_{index}.model: the Word2Vec word vector model
- topic_model_{index}.pkl: the topic information data
1. Install dependencies
pip install gensim scikit-learn jieba numpy
2. Load the models
from gensim.models import Word2Vec
import pickle
import numpy as np

# Load the word vector model and the pickled topic data
w2v_model = Word2Vec.load('word2vec_model_0.model')
with open('topic_model_0.pkl', 'rb') as f:
    topic_data = pickle.load(f)
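Since the .pkl file is a plain pickled Python object, its structure can be inspected before use; the key names assumed in the rest of this guide are 'topic_words' and 'topic_vectors':

# Check what the pickle actually contains before relying on it
print(type(topic_data))          # expected: <class 'dict'>
print(list(topic_data.keys()))   # expected to include 'topic_words' and 'topic_vectors'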
3. View the topics
# Show the top 10 words of each topic
topic_words = topic_data['topic_words']
for i, topic in enumerate(topic_words):
    print(f"Topic {i + 1}:")
    for word, score in topic[:10]:
        print(f"  {word}: {score:.3f}")
4. Word vector analysis
# Find the words most similar to a query word
word = "示例词"  # placeholder: any word from the training vocabulary
if word in w2v_model.wv:
    similar_words = w2v_model.wv.most_similar(word, topn=10)
    print(f"Words similar to '{word}':")
    for sim_word, similarity in similar_words:
        print(f"  {sim_word}: {similarity:.3f}")
5. Analyze a new document
from sklearn.metrics.pairwise import cosine_similarity
import jieba

def analyze_document(text):
    # Tokenize and drop single-character tokens (same preprocessing as training)
    words = [word for word in jieba.cut(text) if len(word) > 1]

    # Average the word vectors to get a document vector
    doc_vectors = []
    for word in words:
        if word in w2v_model.wv:
            doc_vectors.append(w2v_model.wv[word])
    if not doc_vectors:
        return None
    doc_vector = np.mean(doc_vectors, axis=0)

    # Cosine similarity between the document vector and each topic vector
    topic_vectors = topic_data['topic_vectors']
    similarities = []
    for topic_vector in topic_vectors:
        sim = cosine_similarity([doc_vector], [topic_vector])[0][0]
        similarities.append(max(0, sim))

    # Normalize the similarities into pseudo-probabilities
    total = sum(similarities)
    if total > 0:
        probs = [sim / total for sim in similarities]
    else:
        probs = [1 / len(similarities)] * len(similarities)
    return probs

# Usage example
new_text = "您的新文档内容"  # placeholder: your new document text
result = analyze_document(new_text)
if result:
    for i, prob in enumerate(result):
        print(f"Topic {i + 1}: {prob:.3f}")
Notes
- New documents must be preprocessed exactly the same way as the training data.
- Make sure the correct versions of the dependency packages are installed (see the version-check sketch after this list).
- Adjust the file paths to match where you actually downloaded the files.
- The BOW method is better suited to topic interpretation; the Word2Vec method focuses on semantic similarity between words.
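For the version point above, the installed versions can be printed and compared against the training environment; a minimal sketch (all four packages expose __version__):

# Print installed versions to match them against the training environment
import gensim, jieba, numpy, sklearn
for pkg in (gensim, jieba, numpy, sklearn):
    print(f"{pkg.__name__}: {pkg.__version__}")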