import time
import numpy as np
import math
# Shared word table. read_file() fills it with word -> occurrence count;
# buildHFMTree() later overwrites each value with the word's leaf index.
wordHash = {}
# Number of distinct words; read_file() increments it the first time a word is seen.
wordNum = 0
# Context window size used by train() when gathering the words around a position.
window = 2
# Token list published by read_file(); train() iterates over it.
words = []
# Length of each word vector and of each internal-node parameter column.
vecSize = 100
# Factor that updataParam() multiplies into every gradient (the learning rate).
u = 0.1
# train() loops over range(t + 1), i.e. it makes t + 1 passes over `words`.
t = 500
# Load the corpus and store every word in the shared word table.
def read_file():
    """Read ``test.txt`` and populate the module-level vocabulary state.

    Side effects (module globals):
        wordHash: each token's occurrence count is added to the table.
        wordNum:  incremented once for every token not seen before.
        words:    rebound to the list of all tokens of the file, in reading
                  order. Tokens of consecutive lines are simply concatenated,
                  so code sliding a window over ``words`` treats neighbouring
                  lines as contiguous text.

    Tokens are split on any run of whitespace, so the trailing newline never
    sticks to the last word of a line and repeated spaces do not create empty
    tokens.

    Raises:
        OSError: if ``test.txt`` cannot be opened.
    """
    global wordNum, wordHash, words
    # The context manager closes the file even if decoding fails part-way.
    with open("test.txt", encoding="utf-8") as f:
        tokens = [word for sentence in f for word in sentence.split()]
    for word in tokens:
        if word in wordHash:
            wordHash[word] += 1
        else:
            wordHash[word] = 1
            wordNum += 1
    # Publish the whole corpus, not just the last line that was read.
    words = tokens
# Build the Huffman tree (and thereby the Huffman codes) for the vocabulary.
def buildHFMTree():
    """Build a Huffman tree from the word counts stored in ``wordHash``.

    Side effect: every value of ``wordHash`` is replaced by the word's leaf
    index (0 is the most frequent word). The counts are consumed by this
    call, so calling it again would treat those indices as counts.

    Node layout for a vocabulary of ``n`` words: indices ``0..n-1`` are the
    leaves in descending count order, ``n..2n-2`` are the internal nodes in
    creation order (``2n-2`` is the root) and index ``2n-1`` is never used.

    Returns:
        (pos, parent): two lists of length ``2n``. ``parent[i]`` is the index
        of node ``i``'s parent and ``pos[i]`` is its branch bit (0 for the
        node picked first in a merge, i.e. the lighter one, 1 for the other).
        Both hold ``None`` for the root and for the unused slot. With fewer
        than two words no merge happens and every entry is ``None``.
    """
    global wordHash
    vocab = sorted(wordHash.items(), key=lambda item: item[1], reverse=True)
    leaves = len(vocab)
    length = leaves * 2
    # Not-yet-built internal nodes must compare heavier than every real node.
    # The heaviest real node is the root, whose weight is the total count, so
    # total + 1 is a safe sentinel however frequent a single word is.
    sentinel = sum(count for _, count in vocab) + 1
    weight = [sentinel] * length
    parent = [None] * length
    pos = [None] * length
    for index, (word, count) in enumerate(vocab):
        wordHash[word] = index
        weight[index] = count
    if leaves < 2:
        # Nothing to merge: an empty or single-word vocabulary has no branches.
        return pos, parent
    # lp walks the leaves from lightest to heaviest (right to left), rp walks
    # the internal nodes in creation order, addp is the next internal slot.
    lp = leaves - 1
    rp = leaves
    addp = leaves
    while True:
        if lp < 0:
            # Only internal nodes remain; a single one left means it is the root.
            if rp + 1 == addp:
                break
            first, second = rp, rp + 1
            rp += 2
        elif weight[lp] < weight[rp]:
            first = lp
            if lp - 1 >= 0 and weight[lp - 1] < weight[rp]:
                second = lp - 1
                lp -= 2
            else:
                second = rp
                lp -= 1
                rp += 1
        else:
            first = rp
            if weight[rp + 1] > weight[lp]:
                second = lp
                lp -= 1
                rp += 1
            else:
                second = rp + 1
                rp += 2
        weight[addp] = weight[first] + weight[second]
        pos[first] = 0
        pos[second] = 1
        parent[first] = addp
        parent[second] = addp
        addp += 1
    return pos, parent
def sigmiod(n):
    """Return the logistic sigmoid ``1 / (1 + exp(-n))`` of ``n``.

    Only ``exp(-|n|)`` is ever evaluated, so the result stays finite for
    arbitrarily large inputs: very large positive ``n`` gives 1.0 and very
    large negative ``n`` gives 0.0, never ``nan``.

    Args:
        n: a real scalar or an array of reals.

    Returns:
        A NumPy float scalar for scalar input, otherwise an array with the
        shape of ``n``.
    """
    x = np.asarray(n, dtype=float)
    decay = np.exp(-np.abs(x))  # always in (0, 1], cannot overflow
    out = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # higher-dimensional arrays untouched.
    return out[()]
def getHFMCode(word, pos, parent):
    """Collect and print the branch bits on the path from a word to the root.

    Args:
        word: a key of the module-level ``wordHash`` table, whose value is
            used as the starting node index.
        pos: per-node branch bits, indexed by node.
        parent: per-node parent indices; ``None`` marks the node where the
            walk stops.

    Returns:
        The list of bits in the order they are met while climbing, i.e. the
        starting node's bit comes first.
    """
    node = wordHash[word]
    bits = []
    while True:
        above = parent[node]
        if above is None:
            break
        bits.append(pos[node])
        node = above
    print("".join(["單詞'", word, "'的哈弗曼編碼:", str(bits)]))
    return bits
def updataParam(word, pos, parent, ansVec, projVec, paramVec):
    """Compute the updates contributed by one target word.

    Walks from the word's leaf up to the root. For every internal node on
    the way it compares the sigmoid of that node's score with the branch bit
    and accumulates the scaled gradients.

    Args:
        word: target word; ``wordHash[word]`` is its leaf index.
        pos: per-node branch bits.
        parent: per-node parent indices, ``None`` at the root.
        ansVec: scores indexed by ``parent index - wordNum``.
        projVec: context vector of length ``vecSize``.
        paramVec: array whose column ``parent index - wordNum`` is the
            parameter vector of that internal node.

    Returns:
        (projChange, paramChange, ll): the update for the context vector
        (length ``vecSize``), the update for ``paramVec`` (shape
        ``(vecSize, wordNum - 1)``) and the accumulated log-likelihood of the
        word's path. Nothing is applied to the inputs here.
    """
    global wordHash
    node = wordHash[word]
    ll = 0
    paramChange = np.zeros((vecSize, wordNum - 1))
    projChange = np.zeros(vecSize)
    while parent[node] is not None:
        d = pos[node]
        col = parent[node] - wordNum  # column of this internal node
        prob = sigmiod(ansVec[col])   # evaluated once per node
        try:
            ll += (1 - d) * math.log(prob) + d * math.log(1 - prob)
        except ValueError:
            # math.log rejects 0, which happens when the sigmoid saturates
            # to exactly 0.0 or 1.0; such a node adds nothing to ll.
            pass
        m = 1 - d - prob
        paramChange[:, col] += m * projVec * u
        projChange += m * paramVec[:, col] * u
        node = parent[node]
    return projChange, paramChange, ll
def initVec():
    """Create the initial word vectors and internal-node parameters.

    Reads the module-level ``wordNum`` and ``vecSize``.

    Returns:
        (wordVec, paramVec): ``wordVec`` has shape ``(wordNum, vecSize)`` with
        entries drawn uniformly from ``[-0.5 / vecSize, 0.5 / vecSize)``;
        ``paramVec`` is all zeros with shape ``(vecSize, wordNum - 1)``, or
        zero columns when the vocabulary is empty.
    """
    # One vectorised expression replaces the per-element Python loop; it
    # draws the same random numbers and applies the same arithmetic.
    wordVec = (np.random.random((wordNum, vecSize)) - 0.5) / vecSize
    # Clamp so that an empty vocabulary does not ask for a negative dimension.
    paramVec = np.zeros((vecSize, max(wordNum - 1, 0)))
    return wordVec, paramVec
def train(wordVec, paramVec, pos, parent):
    """Train the word vectors and internal-node parameters in place.

    Makes ``t + 1`` passes over the module-level ``words`` list. For each
    position the vectors of the surrounding words (up to ``window`` on each
    side) are averaged, ``updataParam`` computes the updates for the word at
    that position, and the updates are accumulated. The accumulated updates
    are applied once at the end of every pass.

    Args:
        wordVec: array with one row per word, indexed by ``wordHash[word]``;
            modified in place.
        paramVec: array with one column per internal tree node; modified in
            place.
        pos: per-node branch bits, passed through to ``updataParam``.
        parent: per-node parent indices, passed through to ``updataParam``.

    Returns:
        None. On every 100th pass (including pass 0) the loss of each word is
        printed.
    """
    total = len(words)
    for k in range(t + 1):
        paramChange = np.zeros((vecSize, wordNum - 1))
        wordChange = np.zeros((wordNum, vecSize))
        for i, target in enumerate(words):
            # Symmetric window: `window` words on the left AND on the right.
            # The upper bound is exclusive, hence the + 1.
            lo = max(0, i - window)
            hi = min(total, i + window + 1)
            context = [wordHash[words[j]] for j in range(lo, hi) if j != i]
            if not context:
                # A lone word has no neighbours; averaging would divide by 0.
                continue
            projVec = np.zeros(vecSize)
            for row in context:
                projVec += wordVec[row]
            projVec = projVec / len(context)
            ansVec = projVec.dot(paramVec)
            projChange1, paramChange1, ll = updataParam(
                target, pos, parent, ansVec, projVec, paramVec)
            # Explicit loop on purpose: a word occurring twice in the window
            # must receive the update twice, which fancy-index += would not do.
            for row in context:
                wordChange[row] += projChange1
            paramChange += paramChange1
            if k % 100 == 0:
                print("第" + str(k) + "輪訓練中,單詞" + words[i] + "的損失爲:" + str(ll))
        wordVec += wordChange
        paramVec += paramChange
# Pipeline driver: these module-level statements run in order whenever the
# file is executed.
# Step 1: read test.txt; fills wordHash, wordNum and words.
print("開始讀取單詞")
read_file()
print("讀取單詞結束")
# Step 2: build the Huffman tree. After this call wordHash maps each word to
# its leaf index instead of its count.
print("開始構建哈弗曼樹")
pos,parent = buildHFMTree()
print("構建完成")
# Debug aid: uncomment to print the Huffman code of a single word.
#getHFMCode('interpretations',pos,parent)
# Step 3: create the initial word vectors and internal-node parameters.
print("開始初始化單詞向量")
wordVec,paramVec = initVec()
print("單詞向量初始完成")
# Step 4: train; wordVec and paramVec are updated in place.
print("準備訓練參數")
train(wordVec,paramVec,pos,parent)
語料:
In the near future the translation history will only be viewable when you log in to your account and it will be centrally managed in my activity record. This upgrade will clear the previous history so if you want the system to record certain translations for future review please be sure to save the translation results
執行結果:
不足
相對於原始碼,本實作尚未實現負採樣、多執行緒,以及以近似方式計算指數來降低運算量等功能。