Segmentation: jieba.cut
import jieba

words = jieba.cut("我來到北京大學", cut_all=True)
print('全模式:' + '/'.join(words))    # full mode
words = jieba.cut("我來到北京大學", cut_all=False)
print('精確模式:' + '/'.join(words))  # precise mode (the default)
words = jieba.cut_for_search("小明畢業於北京大學,後在美國哈佛大學深造")
print('/'.join(words))  # search-engine mode: long words from the precise-mode result are segmented again
全模式:我/來到/北京/北京大學/大學
精確模式:我/來到/北京大學
小明/畢業/於/北京/大學/北京大學/,/後/在/美國/哈佛/大學/美國哈佛大學/深造
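As a side note, jieba also provides list-returning variants of these calls; a minimal sketch using jieba.lcut and jieba.lcut_for_search, which behave like cut/cut_for_search but return plain lists instead of generators:

import jieba

# lcut/lcut_for_search return lists, so the results can be reused without re-cutting
print(jieba.lcut("我來到北京大學", cut_all=True))    # full mode
print(jieba.lcut("我來到北京大學"))                   # precise mode (default)
print(jieba.lcut_for_search("小明畢業於北京大學,後在美國哈佛大學深造"))  # search-engine mode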
Part-of-speech tagging: jieba.posseg
import jieba.posseg as pg

for word, flag in pg.cut("你想去學校填寫學生寒暑假住校申請表嗎?"):
    print('%s %s' % (word, flag))
Each token is printed on its own line together with its part-of-speech flag (for example n for nouns, v for verbs, r for pronouns).
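A common next step is to keep only certain parts of speech; a minimal sketch, assuming we only want tokens whose flag starts with 'n' (nouns) or 'v' (verbs):

import jieba.posseg as pg

sentence = "你想去學校填寫學生寒暑假住校申請表嗎?"
# keep only tokens tagged as nouns or verbs
kept = [w for w, flag in pg.cut(sentence) if flag.startswith(('n', 'v'))]
print('/'.join(kept))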
Segmentation with stop words
import jieba
import pandas as pd
import numpy as np

paths = '中英文停用詞.xlsx'
dfs = pd.read_excel(paths, dtype=str)   # stop-word table, used again in later sections
stopwords = ['想', '去', '嗎', '?']
words = jieba.cut("你想去學校填寫學生寒暑假住校申請表嗎?")
'/'.join([w for w in words if w not in stopwords])  # '/' is just the separator between the remaining words
'你/學校/填寫/學生/寒暑假/住校/申請表'
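When the stop words come from the Excel table loaded above instead of a hand-written list, converting that column to a set makes the membership test cheap; this sketch assumes the sheet has a column named 'stopwords', the same name used in the later sections:

stopword_set = set(dfs['stopwords'].dropna())   # assumed column name: 'stopwords'
words = jieba.cut("你想去學校填寫學生寒暑假住校申請表嗎?")
print('/'.join(w for w in words if w not in stopword_set))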
Converting a txt file to a DataFrame
import random
import jieba.posseg as pg
import pandas as pd
import numpy as np

def generatorInfo(file_name):
    # read the text file; each line holds a label and the document text
    with open(file_name, encoding='utf-8') as file:
        line_list = [k.strip() for k in file.readlines()]
    data = []
    for k in random.sample(line_list, 1000):   # randomly sample 1000 lines
        t = k.split(maxsplit=1)
        # keep the label plus the space-joined tokens that are not stop words,
        # not blank, and at least two characters long
        # (dfs is the stop-word table loaded in the section above; the 'stopwords' column name is assumed)
        data.append([t[0], ' '.join([w for w, flag in pg.cut(t[1])
                                     if (w not in dfs['stopwords']) and (w != ' ') and (len(w) >= 2)])])
    return data

file_name = 'cnews.train.txt'
df = pd.DataFrame(np.array(generatorInfo(file_name)), columns=['類別', '分詞'])
path = '訓練集分詞結果(隨機選取1000個樣本).xlsx'
df.to_excel(path, index=False)
df
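As a quick sanity check on the random sample, the category distribution can be inspected with value_counts:

# number of sampled documents per category
print(df['類別'].value_counts())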
Word cloud: wordcloud
%matplotlib inline
import matplotlib.pyplot as plt
from wordcloud import WordCloud

text = ' '.join(list(df['分詞']))
wcloud = WordCloud(
    font_path='simsun.ttc',     # path to a font that contains CJK glyphs
    background_color='white',   # background colour
    max_words=500,              # maximum number of words shown
    max_font_size=150,          # maximum font size
    # mask=mask                 # optional background image
)
wcloud = wcloud.generate(text)  # generate the word cloud
plt.imshow(wcloud)
plt.axis('off')
plt.show()
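The rendered image can also be written to disk with WordCloud.to_file; the file name below is just an example:

wcloud.to_file('wordcloud.png')   # example output path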
Keyword extraction: jieba.analyse.extract_tags
import jieba.analyse
import pandas as pd
import numpy as np

path = '訓練集分詞結果(隨機選取1000個樣本).xlsx'
df = pd.read_excel(path, dtype=str)
s = ' '.join(list(df['分詞']))
for w, x in jieba.analyse.extract_tags(s, withWeight=True):
    print('%s %s' % (w, x))
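By default extract_tags returns the 20 highest-weighted keywords; the topK parameter changes how many are returned, for example:

# top 50 keywords instead of the default 20
for w, x in jieba.analyse.extract_tags(s, topK=50, withWeight=True):
    print('%s %s' % (w, x))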
# keyword extraction per category
import jieba.analyse
import pandas as pd
import numpy as np

path = '訓練集分詞結果(隨機選取1000個樣本).xlsx'
df = pd.read_excel(path, dtype=str)
tag = list(set(df['類別']))
for t in tag:
    s = ' '.join(list(df[df['類別'] == t]['分詞']))
    print(t)
    for w, x in jieba.analyse.extract_tags(s, withWeight=True):
        print('%s %s' % (w, x))
Building word vectors
There are two simple ways to build word vectors: CountVectorizer and TfidfTransformer.
# CountVectorizer turns the documents into a term-frequency matrix
from sklearn.feature_extraction.text import CountVectorizer

path = '訓練集分詞結果(隨機選取1000個樣本).xlsx'
df = pd.read_excel(path, dtype=str)
corpus = df['分詞']
# vectorizer = CountVectorizer(max_features=5000)   # optionally cap the vocabulary size
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(X)
# TfidfTransformer re-weights the term-frequency matrix into TF-IDF values
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
word = vectorizer.get_feature_names_out()   # get_feature_names() was removed in newer scikit-learn
weight = tfidf.toarray()
print(weight)
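As an aside, scikit-learn's TfidfVectorizer folds the counting and TF-IDF steps into a single object; a minimal sketch that should produce the same matrix as the two-step pipeline above (with default settings):

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
weight_alt = tfidf_vec.fit_transform(corpus).toarray()   # same shape as weight above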
Classification: manual labels vs. KMeans
from sklearn.cluster import KMeans

path = '訓練集分詞結果(隨機選取1000個樣本).xlsx'
df = pd.read_excel(path, dtype=str)
corpus = df['分詞']
kmeans = KMeans(n_clusters=10)   # n_clusters: the number of clusters to form
kmeans.fit(weight)
res = [list(df['類別']), list(kmeans.labels_)]
df_res = pd.DataFrame(np.array(res).T, columns=['人工分類', 'Kmeans分類'])
path_res = 'Kmeans自動分類結果.xlsx'
df_res.to_excel(path_res, index=False)
df_res
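Beyond eyeballing the table, the agreement between the manual labels and the KMeans clusters can be summarized with a single number; a sketch using the adjusted Rand index from scikit-learn:

from sklearn.metrics import adjusted_rand_score

# 1.0 means perfect agreement with the manual labels, values near 0 mean random assignment
print(adjusted_rand_score(df_res['人工分類'], df_res['Kmeans分類']))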
path = 'Kmeans自動分類結果.xlsx'
df = pd.read_excel(path, dtype=str)
df['計數'] = [1 for m in range(len(df['人工分類']))]   # helper column so the pivot table can count rows
df1 = pd.pivot_table(df, index=['人工分類'], columns=['Kmeans分類'], values=['計數'], aggfunc=np.sum, fill_value=0)
co = ['人工分類']
co.extend(list(df1['計數'].columns))
df1 = df1.reset_index()
df2 = pd.DataFrame(np.array(df1), columns=co)
path_res = '人工與Kmeans分類結果對照.xlsx'
df2.to_excel(path_res, index=False)
df2
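The same contingency table can be built more directly with pandas.crosstab, which avoids the helper count column and the column renaming:

# rows: manual categories, columns: KMeans cluster ids
print(pd.crosstab(df['人工分類'], df['Kmeans分類']))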
import random
import jieba.posseg
import pandas as pd
import numpy as np

def is_contain_chinese(check_str):
    # returns 1 if the string contains at least one CJK character, otherwise 0
    for ch in check_str:
        if u'\u4e00' <= ch <= u'\u9fff':
            return 1
    return 0

def generatorInfo(file_name):
    """
    file_name: path of the text file to read; each line holds a label and the document text.
    Tokens that are stop words, blank, shorter than two characters, or whose POS flag is in
    the excluded list (proper nouns, numerals, pronouns, time words, etc.) are dropped.
    """
    # read the text file
    with open(file_name, encoding='utf-8') as file:
        line_list = [k.strip() for k in file.readlines()]
    data = []
    for k in random.sample(line_list, 1000):
        t = k.split(maxsplit=1)
        data.append([t[0], ' '.join([w for w, flag in jieba.posseg.cut(t[1])
                                     if (w not in dfs['stopwords']) and (w != ' ')
                                     and (flag not in ["nr", "ns", "nt", "nz", "m", "f", "ul", "l", "r", "t"])
                                     and (len(w) >= 2)])])
    return data

# load the Chinese/English stop-word table
paths = '中英文停用詞.xlsx'
dfs = pd.read_excel(paths, dtype=str)
file_name = 'cnews.train.txt'
df = pd.DataFrame(np.array(generatorInfo(file_name)), columns=['類別', '分詞'])
df
Putting it all together
import random
import jieba
import jieba.posseg
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cluster import KMeans

def is_contain_chinese(check_str):
    # returns 1 if the string contains at least one CJK character, otherwise 0
    for ch in check_str:
        if u'\u4e00' <= ch <= u'\u9fff':
            return 1
    return 0

def generatorInfo(file_name):
    """
    file_name: path of the text file to read; each line holds a label and the document text.
    """
    # read the text file
    with open(file_name, encoding='utf-8') as file:
        line_list = [k.strip() for k in file.readlines()]
    data = []
    for k in random.sample(line_list, 1000):
        t = k.split(maxsplit=1)
        # keep the label and the POS-filtered, stop-word-filtered tokens joined by spaces
        data.append([t[0], ' '.join([w for w, flag in jieba.posseg.cut(t[1])
                                     if (w not in dfs['stopwords']) and (w != ' ')
                                     and (flag not in ["nr", "ns", "nt", "nz", "m", "f", "ul", "l", "r", "t"])
                                     and (len(w) >= 2)])])
    return data

# load the Chinese/English stop-word table
paths = '中英文停用詞.xlsx'
dfs = pd.read_excel(paths, dtype=str)
file_name = 'cnews.train.txt'
df = pd.DataFrame(np.array(generatorInfo(file_name)), columns=['類別', '分詞'])
# term-frequency counts
corpus = df['分詞']   # the tokens of each document are separated by spaces
# vectorizer = CountVectorizer(max_features=5000)   # optionally cap the vocabulary size
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# TF-IDF weighting
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
word = vectorizer.get_feature_names_out()   # get_feature_names() was removed in newer scikit-learn
weight = tfidf.toarray()
# KMeans clustering
kmeans = KMeans(n_clusters=10)   # n_clusters: the number of clusters to form
kmeans.fit(weight)
res = [list(df['類別']), list(kmeans.labels_)]
df_res = pd.DataFrame(np.array(res).T, columns=['人工分類', 'Kmeans分類'])
# contingency table: manual categories vs. KMeans cluster ids
df_res['計數'] = [1 for m in range(len(df_res['人工分類']))]
df1 = pd.pivot_table(df_res, index=['人工分類'], columns=['Kmeans分類'], values=['計數'], aggfunc=np.sum, fill_value=0)
co = ['人工分類']
co.extend(list(df1['計數'].columns))
df1 = df1.reset_index()
df2 = pd.DataFrame(np.array(df1), columns=co)
df2
df['Kmeans分類'] = df_res['Kmeans分類']
df
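To see which manual category dominates each cluster, the cluster ids can be mapped to their most frequent manual label; a small sketch:

# the most common manual category inside each KMeans cluster
majority = df_res.groupby('Kmeans分類')['人工分類'].agg(lambda s: s.value_counts().idxmax())
print(majority)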