
Text Mining and NLP Notes (Code Edition): Word Segmentation - 孤飞

source link: https://www.cnblogs.com/ranxi169/p/16833336.html

Text Mining and NLP Notes (Code Edition): Word Segmentation

Word segmentation: jieba.cut

import jieba

words = jieba.cut("我来到北京大学", cut_all=True)
print('全模式:' + '/'.join([w for w in words]))  # full mode

words = jieba.cut("我来到北京大学", cut_all=False)
print('精确模式:' + '/'.join([w for w in words]))  # precise mode (the default)

words = jieba.cut_for_search("小明毕业于北京大学,后在美国哈佛大学深造")
print('/'.join([w for w in words]))  # search-engine mode: precise mode first, then long words are segmented again

全模式:我/来到/北京/北京大学/大学
精确模式:我/来到/北京大学
小明/毕业/于/北京/大学/北京大学/,/后/在/美国/哈佛/大学/美国哈佛大学/深造

Exercise: try adding a custom dictionary; a sketch follows.
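As a starting point for the exercise, a minimal sketch of the two ways to extend the dictionary; the file name userdict.txt and the added entry are illustrative, not part of the original post:

import jieba

jieba.add_word('美国哈佛大学')           # add a single word at runtime
# jieba.load_userdict('userdict.txt')   # or load a file; each line is: word [freq] [pos]

words = jieba.cut("小明毕业于北京大学,后在美国哈佛大学深造")
print('/'.join(words))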

Part-of-speech tagging: jieba.posseg

import jieba.posseg as pg

for word, flag in pg.cut("你想去学校填写学生寒暑假住校申请表吗?"):
    print('%s %s' % (word, flag))

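The flags follow the usual jieba/ICTCLAS tag set (n for nouns, v for verbs, r for pronouns, and so on), so they are handy for filtering. A minimal sketch that keeps only noun-like words (not from the original post):

import jieba.posseg as pg

sentence = "你想去学校填写学生寒暑假住校申请表吗?"
nouns = [w for w, flag in pg.cut(sentence) if flag.startswith('n')]   # keep nouns only
print('/'.join(nouns))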

Segmentation with stop words removed

import jieba
import pandas as pd
import numpy as np

paths = '中英文停用词.xlsx'
dfs = pd.read_excel(paths, dtype=str)   # stop-word table (used again in later cells)

stopwords = ['想', '去', '吗', '?']

words = jieba.cut("你想去学校填写学生寒暑假住校申请表吗?")
'/'.join([w for w in words if (w not in stopwords)])  # '/' is just the separator between the remaining words

'你/学校/填写/学生/寒暑假/住校/申请表'
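Note that testing membership against a pandas column with `in` checks the index rather than the values, so the later cells first turn the stop-word column into a set. A minimal sketch, assuming the Excel file has a column named stopwords:

stopword_set = set(dfs['stopwords'].dropna())   # the values, not the Series index
words = jieba.cut("你想去学校填写学生寒暑假住校申请表吗?")
print('/'.join([w for w in words if w not in stopword_set]))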

A function to turn the txt file into a DataFrame

import random
import jieba.posseg as pg
import pandas as pd
import numpy as np

def generatorInfo(file_name):
    # read the text file; each line is "label<whitespace>content"
    stopword_set = set(dfs['stopwords'].dropna())  # a set: `in` on a Series would test the index, not the values
    with open(file_name, encoding='utf-8') as file:
        line_list = [k.strip() for k in file.readlines()]
        data = []
        for k in random.sample(line_list, 1000):  # randomly sample 1000 lines
            t = k.split(maxsplit=1)
            # keep words that are not stop words, not blank, and at least 2 characters long
            data.append([t[0], ' '.join([w for w, flag in pg.cut(t[1])
                                         if (w not in stopword_set) and (w != ' ') and (len(w) >= 2)])])
    return data

file_name = 'cnews.train.txt'
df = pd.DataFrame(np.array(generatorInfo(file_name)),columns=['类别','分词'])
path = '训练集分词结果(随机选取1000个样本).xlsx'
df.to_excel(path,index=False)
df

[output screenshot]

Word cloud: wordcloud

%matplotlib inline
import matplotlib.pyplot as plt
from wordcloud import WordCloud

text = ' '.join(list(df['分词']))
wcloud = WordCloud(
    font_path='simsun.ttc',    # font path (a Chinese font is needed to render the words)
    background_color='white',  # background colour
    max_words=500,             # maximum number of words shown
    max_font_size=150,         # maximum font size
    #mask = mask               # optional background image
)

wcloud = wcloud.generate(text)  # generate the word cloud
plt.imshow(wcloud)
plt.axis('off')
plt.show()

[output screenshot]
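To reuse the figure outside the notebook, the WordCloud object can also be written straight to an image file; the file name below is illustrative:

wcloud.to_file('wordcloud.png')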

Keyword extraction: jieba.analyse.extract_tags

import jieba.analyse
import pandas as pd
import numpy as np

path = '训练集分词结果(随机选取1000个样本).xlsx'
df = pd.read_excel(path,dtype=str)
s = ' '.join(list(df['分词']))
for w,x in jieba.analyse.extract_tags(s,withWeight=True):
    print('%s %s' % (w,x))

[output screenshot]

Exercise: try extracting keywords with the TextRank algorithm; a sketch follows.
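A minimal sketch for this exercise using jieba.analyse.textrank on the same concatenated string s; topK=20 is jieba's default, and withWeight=True returns the scores as well:

import jieba.analyse

for w, x in jieba.analyse.textrank(s, topK=20, withWeight=True):
    print('%s %s' % (w, x))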
import jieba.analyse
import pandas as pd
import numpy as np

path = '训练集分词结果(随机选取1000个样本).xlsx'
df = pd.read_excel(path, dtype=str)
tag = list(set(list(df['类别'])))   # the distinct manual categories

for t in tag:
    s = ' '.join(list(df[df['类别']==t]['分词']))
    print(t)
    for w, x in jieba.analyse.extract_tags(s, withWeight=True):
        print('%s %s' % (w, x))

[output screenshot]

Building word vectors

Two simple ways to build word vectors are CountVectorizer, which produces raw term-frequency counts, and TfidfTransformer, which turns those counts into TF-IDF weights.

# CountVectorizer converts the words in the texts into a term-frequency matrix
from sklearn.feature_extraction.text import CountVectorizer
path = '训练集分词结果(随机选取1000个样本).xlsx'
df = pd.read_excel(path, dtype=str)
corpus = df['分词']
#vectorizer = CountVectorizer(max_features=5000)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(X)   # X is a sparse matrix; use X.toarray() for a dense view

[output screenshot]
from sklearn.feature_extraction.text import TfidfTransformer
import datetime

starttime = datetime.datetime.now()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)      # convert the count matrix to TF-IDF weights
word = vectorizer.get_feature_names()     # use get_feature_names_out() on scikit-learn >= 1.2
weight = tfidf.toarray()
print(weight)

[output screenshot]
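As a side note, scikit-learn's TfidfVectorizer combines the counting and TF-IDF weighting into a single estimator; a minimal sketch that should give the same matrix as the two-step version above:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
weight_alt = tfidf_vec.fit_transform(corpus).toarray()   # same shape as weight above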

Text categories: manual vs. KMeans

from sklearn.cluster import KMeans

starttime = datetime.datetime.now()

path = '训练集分词结果(随机选取1000个样本).xlsx'
df = pd.read_excel(path,dtype=str)
corpus = df['分词']

kmeans = KMeans(n_clusters=10)   # n_clusters: number of clusters
kmeans.fit(weight)
res = [list(df['类别']),list(kmeans.labels_)]
df_res = pd.DataFrame(np.array(res).T,columns=['人工分类','Kmeans分类'])
path_res = 'Kmeans自动分类结果.xlsx'
df_res.to_excel(path_res,index=False)
df_res

[output screenshot]
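KMeans is initialised randomly, so the cluster ids (and to some extent the groupings) change between runs; passing the standard scikit-learn random_state parameter makes the result reproducible:

kmeans = KMeans(n_clusters=10, random_state=42)   # 42 is an arbitrary seed, shown for illustration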
path = 'Kmeans自动分类结果.xlsx'
df = pd.read_excel(path,dtype=str)

df['计数'] = [1 for m in range(len(df['人工分类']))]   # helper column of 1s, summed by the pivot table
df1 = pd.pivot_table(df, index=['人工分类'], columns=['Kmeans分类'], values=['计数'], aggfunc=np.sum, fill_value=0)
co = ['人工分类']
co.extend(list(df1['计数'].columns))
df1 = df1.reset_index()
df2 = pd.DataFrame((np.array(df1)), columns=co)   # flatten the MultiIndex columns into plain names

path_res = '人工与Kmeans分类结果对照.xlsx'
df2.to_excel(path_res,index=False)

df2

[output screenshot]
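Beyond eyeballing the cross-tabulation, the agreement between the manual categories and the KMeans labels can be summarised in one number; a minimal sketch using scikit-learn's adjusted Rand index, assuming df_res from above is still in memory:

from sklearn.metrics import adjusted_rand_score

ari = adjusted_rand_score(df_res['人工分类'], df_res['Kmeans分类'])
print('Adjusted Rand index: %.3f' % ari)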
import random
import jieba.posseg
import pandas as pd
import numpy as np

def is_contain_chinese(check_str):
    # return 1 if the string contains at least one Chinese character, else 0
    for ch in check_str:
        if u'\u4e00' <= ch <= u'\u9fff':
            return 1
    return 0

def generatorInfo(file_name):
    """
    file_name: path of the file to read; each line is "label<whitespace>content"
    """
    # read the text file
    stopword_set = set(dfs['stopwords'].dropna())  # a set: `in` on a Series would test the index, not the values
    with open(file_name, encoding='utf-8') as file:
        line_list = [k.strip() for k in file.readlines()]
        data = []
        for k in random.sample(line_list, 1000):  # randomly sample 1000 lines
            t = k.split(maxsplit=1)
            # keep words that are not stop words, not blank, at least 2 characters long, and whose
            # POS tag is not in the excluded list (names, places, organisations, numerals, pronouns, time words, ...)
            data.append([t[0], ' '.join([w for w, flag in jieba.posseg.cut(t[1])
                                         if (w not in stopword_set) and (w != ' ')
                                         and (flag not in ["nr","ns","nt","nz","m","f","ul","l","r","t"])
                                         and (len(w) >= 2)])])
    return data

# load the stop-word table (Chinese and English)
paths = '中英文停用词.xlsx'
dfs = pd.read_excel(paths,dtype=str)

file_name = 'cnews.train.txt'
df = pd.DataFrame(np.array(generatorInfo(file_name)),columns=['类别','分词'])
df

[output screenshot]
import random
import jieba
import jieba.posseg
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfTransformer

def is_contain_chinese(check_str):
    # return 1 if the string contains at least one Chinese character, else 0
    for ch in check_str:
        if u'\u4e00' <= ch <= u'\u9fff':
            return 1
    return 0

def generatorInfo(file_name):
    """
    file_name: path of the file to read; each line is "label<whitespace>content"
    """
    # read the text file
    stopword_set = set(dfs['stopwords'].dropna())  # a set: `in` on a Series would test the index, not the values
    with open(file_name, encoding='utf-8') as file:
        line_list = [k.strip() for k in file.readlines()]
        data = []
        for k in random.sample(line_list, 1000):  # randomly sample 1000 lines
            t = k.split(maxsplit=1)
            # keep words that are not stop words, not blank, at least 2 characters long, and whose
            # POS tag is not in the excluded list (names, places, organisations, numerals, pronouns, time words, ...)
            data.append([t[0], ' '.join([w for w, flag in jieba.posseg.cut(t[1])
                                         if (w not in stopword_set) and (w != ' ')
                                         and (flag not in ["nr","ns","nt","nz","m","f","ul","l","r","t"])
                                         and (len(w) >= 2)])])
    return data

# load the stop-word table (Chinese and English)
paths = '中英文停用词.xlsx'
dfs = pd.read_excel(paths,dtype=str)

file_name = 'cnews.train.txt'
df = pd.DataFrame(np.array(generatorInfo(file_name)),columns=['类别','分词'])

# count word frequencies
corpus = df['分词']   # words within each document are separated by spaces
#vectorizer = CountVectorizer(max_features=5000) 
vectorizer = CountVectorizer() 
X = vectorizer.fit_transform(corpus)

# TF-IDF vectorisation
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
word = vectorizer.get_feature_names()   # use get_feature_names_out() on scikit-learn >= 1.2
weight = tfidf.toarray()

kmeans = KMeans(n_clusters=10)   # n_clusters: number of clusters
kmeans.fit(weight)

res = [list(df['类别']),list(kmeans.labels_)]
df_res = pd.DataFrame(np.array(res).T,columns=['人工分类','Kmeans分类'])

df_res['计数'] = [1 for m in range(len(df_res['人工分类']))]   # helper column of 1s, summed by the pivot table
df1 = pd.pivot_table(df_res, index=['人工分类'], columns=['Kmeans分类'], values=['计数'], aggfunc=np.sum, fill_value=0)
co = ['人工分类']
co.extend(list(df1['计数'].columns))
df1 = df1.reset_index()
df2 = pd.DataFrame((np.array(df1)), columns=co)   # flatten the MultiIndex columns into plain names
df2

[output screenshot]
df['Kmeans分类'] = df_res['Kmeans分类']   # attach each document's KMeans cluster label
df

[output screenshot]
