6
文本挖掘与NLP笔记——代码向:分词 - 孤飞
source link: https://www.cnblogs.com/ranxi169/p/16833336.html
Go to the source link to view the article. You can view the picture content, updated content and better typesetting reading experience. If the link is broken, please click the button below to view the snapshot at that time.
文本挖掘与NLP笔记——代码向:分词
分词:jieba.cut
words = jieba.cut("我来到北京大学",cut_all=True)
print('全模式:'+'/'.join([w for w in words])) #全模式
words = jieba.cut("我来到北京大学",cut_all=False)
print('精确模式:'+'/'.join([w for w in words])) #精确模式,默认
words = jieba.cut_for_search("小明毕业于北京大学,后在美国哈佛大学深造")
print('/'.join([w for w in words])) #搜索引擎模式,在精确模式的基础上,对长词在此划分
全模式:我/来到/北京/北京大学/大学
精确模式:我/来到/北京大学
小明/毕业/于/北京/大学/北京大学/,/后/在/美国/哈佛/大学/美国哈佛大学/深造
请练习添加自定义词典
词性:jieba.posseg
import jieba.posseg as pg
for word, flag in pg.cut("你想去学校填写学生寒暑假住校申请表吗?"):
print('%s %s' % (word, flag))
'你/学校/填写/学生/寒暑假/住校/申请表'
分词引入停用词
import jieba
import pandas as pd
import numpy as np
paths = '中英文停用词.xlsx'
dfs = pd.read_excel(paths,dtype=str)
stopwords = ['想','去','吗','?']
words = jieba.cut("你想去学校填写学生寒暑假住校申请表吗?")
'/'.join([w for w in words if (w not in stopwords)])#此处’/'表示换行
'你/学校/填写/学生/寒暑假/住校/申请表'
txt转dataframe函数
import random
import jieba.posseg as pg
import pandas as pd
import numpy as np
def generatorInfo(file_name):
# 读取文本文件
with open(file_name, encoding='utf-8') as file:
line_list = [k.strip() for k in file.readlines()]
data = []
for k in random.sample(line_list,1000):
t = k.split(maxsplit=1)
#data_label_list.append(t[0])
#data_content_list.append(t[1])
data.append([t[0],' '.join([w for w,flag in pg.cut(t[1]) if (w not in dfs['stopwords']) and (w !=' ') and (len(w)>=2)])])
return data
file_name = 'cnews.train.txt'
df = pd.DataFrame(np.array(generatorInfo(file_name)),columns=['类别','分词'])
path = '训练集分词结果(随机选取1000个样本).xlsx'
df.to_excel(path,index=False)
df
词云图:wordcloud
%pylab inline
import matplotlib.pyplot as plt
from wordcloud import WordCloud
text = ' '.join(list(df['分词']))
wcloud = WordCloud(
font_path='simsun.ttc', #字体路径
background_color='white', #指定背景颜色
max_words=500, #词云显示最大词数
max_font_size=150, #指定最大字号
#mask = mask #背景图片
)
wcloud = wcloud.generate(text) #生成词云
plt.imshow(wcloud)
plt.axis('off')
plt.show()
提取关键词:jieba.analyse.extract_tags
import jieba.analyse
import pandas as pd
import numpy as np
path = '训练集分词结果(随机选取1000个样本).xlsx'
df = pd.read_excel(path,dtype=str)
s = ' '.join(list(df['分词']))
for w,x in jieba.analyse.extract_tags(s,withWeight=True):
print('%s %s' % (w,x))
请练习基于TextRank算法抽取关键词
import jieba.analyse
import pandas as pd
import numpy as np
path = '训练集分词结果(随机选取1000个样本).xlsx'
df = pd.read_excel(path,dtype=str)
tag = list(set(list(df['类别'])))
for t in tag:
s = ' '.join(list(df[df['类别']==t]['分词']))
print(t)
for w,x in jieba.analyse.extract_tags(s,withWeight=True):
print('%s %s' % (x,w))
构建词向量
构建词向量简单的有两种分别是TfidfTransformer和 CountVectorizer
#CountVectorizer会将文本中的词语转换为词频矩阵
from sklearn.feature_extraction.text import CountVectorizer
path = '训练集分词结果(随机选取1000个样本).xlsx'
df = pd.read_excel(path,dtype=str)
corpus = df['分词']
#vectorizer = CountVectorizer(max_features=5000)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(X)
from sklearn.feature_extraction.text import TfidfTransformer
import datetime
starttime = datetime.datetime.now()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
word = vectorizer.get_feature_names()
weight = tfidf.toarray()
print(weight)
词语分类:人工vsKmeans
from sklearn.cluster import KMeans
starttime = datetime.datetime.now()
path = '训练集分词结果(随机选取1000个样本).xlsx'
df = pd.read_excel(path,dtype=str)
corpus = df['分词']
kmeans=KMeans(n_clusters=10) #n_clusters:number of cluster
kmeans.fit(weight)
res = [list(df['类别']),list(kmeans.labels_)]
df_res = pd.DataFrame(np.array(res).T,columns=['人工分类','Kmeans分类'])
path_res = 'Kmeans自动分类结果.xlsx'
df_res.to_excel(path_res,index=False)
df_res
path = 'Kmeans自动分类结果.xlsx'
df = pd.read_excel(path,dtype=str)
df['计数'] = [1 for m in range(len(df['人工分类']))]
df1 = pd.pivot_table(df, index=['人工分类'], columns=['Kmeans分类'], values=['计数'], aggfunc=np.sum, fill_value=0)
co = ['人工分类']
co.extend(list(df1['计数'].columns))
df1 = df1.reset_index()
df2 = pd.DataFrame((np.array(df1)),columns=co)
path_res = '人工与Kmeans分类结果对照.xlsx'
df2.to_excel(path_res,index=False)
df2
import random
def is_contain_chinese(check_str):
for ch in check_str:
if u'\u4e00' <= ch <= u'\u9fff':
return 1
return 0
def generatorInfo(file_name):
"""
batch_size:生成数据的batch size
seq_length:输入文字序列长度
num_classes:文本的类别数
file_name:读取文件的路径
"""
# 读取文本文件
with open(file_name, encoding='utf-8') as file:
line_list = [k.strip() for k in file.readlines()]
#data_label_list = [] # 创建数据标签文件
#data_content_list = [] # 创建数据文本文件
data = []
for k in random.sample(line_list,1000):
t = k.split(maxsplit=1)
#data_label_list.append(t[0])
#data_content_list.append(t[1])
data.append([t[0],' '.join([w for w,flag in jieba.posseg.cut(t[1]) if (w not in dfs['stopwords']) and (w !=' ') and (flag not in ["nr","ns","nt","nz","m","f","ul","l","r","t"]) and (len(w)>=2)])])
return data
#导入中文停用词表
paths = '中英文停用词.xlsx'
dfs = pd.read_excel(paths,dtype=str)
file_name = 'cnews.train.txt'
df = pd.DataFrame(np.array(generatorInfo(file_name)),columns=['类别','分词'])
df
import random
import jieba
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfTransformer
def is_contain_chinese(check_str):
for ch in check_str:
if u'\u4e00' <= ch <= u'\u9fff':
return 1
return 0
def generatorInfo(file_name):
"""
batch_size:生成数据的batch size
seq_length:输入文字序列长度
num_classes:文本的类别数
file_name:读取文件的路径
"""
# 读取文本文件
with open(file_name, encoding='utf-8') as file:
line_list = [k.strip() for k in file.readlines()]
#data_label_list = [] # 创建数据标签文件
#data_content_list = [] # 创建数据文本文件
data = []
for k in random.sample(line_list,1000):
t = k.split(maxsplit=1)
#data_label_list.append(t[0])
#data_content_list.append(t[1])
data.append([t[0],' '.join([w for w,flag in jieba.posseg.cut(t[1]) if (w not in dfs['stopwords']) and (w !=' ') and (flag not in ["nr","ns","nt","nz","m","f","ul","l","r","t"]) and (len(w)>=2)])])
return data
#导入中文停用词表
paths = '中英文停用词.xlsx'
dfs = pd.read_excel(paths,dtype=str)
file_name = 'cnews.train.txt'
df = pd.DataFrame(np.array(generatorInfo(file_name)),columns=['类别','分词'])
#统计词频
corpus = df['分词'] #语料中的单词以空格隔开
#vectorizer = CountVectorizer(max_features=5000)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
#文本向量化
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
word = vectorizer.get_feature_names()
weight = tfidf.toarray()
kmeans=KMeans(n_clusters=10) #n_clusters:number of cluster
kmeans.fit(weight)
res = [list(df['类别']),list(kmeans.labels_)]
df_res = pd.DataFrame(np.array(res).T,columns=['人工分类','Kmeans分类'])
df_res['计数'] = [1 for m in range(len(df_res['人工分类']))]
df1 = pd.pivot_table(df_res, index=['人工分类'], columns=['Kmeans分类'], values=['计数'], aggfunc=np.sum, fill_value=0)
co = ['人工分类']
co.extend(list(df1['计数'].columns))
df1 = df1.reset_index()
df2 = pd.DataFrame((np.array(df1)),columns=co)
df2
df['Kmeans分类'] = df_res['Kmeans分类']
df
Recommend
About Joyk
Aggregate valuable and interesting links.
Joyk means Joy of geeK