python實現 word2vec

2020-08-08 20:54:28

參考兩篇部落格
基礎知識
原始碼解析

import time
import numpy as np
import math

# Vocabulary table. read_file() stores word -> occurrence count here;
# buildHFMTree() later overwrites each value with the word's leaf index.
wordHash = {}
# Incremented once per token by read_file(). Also used to size the arrays
# in initVec()/train() and as the column offset in updataParam().
wordNum = 0
# Context window size used by train() when collecting neighbouring words.
window = 2
# Token sequence that train() iterates over (filled in by read_file()).
words = []
# Length of every word vector and of every inner-node parameter vector.
vecSize = 100
# Step size: updataParam() multiplies each gradient by this value.
u = 0.1
# Epoch bound: train() loops over range(t+1) and logs when k % 100 == 0.
t = 500

# Read the corpus and store every word in the vocabulary map.
def read_file():
    """Load ``test.txt`` and populate the module-level corpus state.

    Side effects on module globals:
        wordHash: each token's count is incremented (created at 1 if new).
        wordNum:  incremented once per token read.
        words:    rebound to the list of all tokens in the file, in order.

    Tokens are split on any whitespace, so the trailing newline of a line is
    not glued to its last word and repeated spaces do not yield empty tokens.

    Raises:
        OSError: if ``test.txt`` cannot be opened (e.g. FileNotFoundError).
    """
    global wordNum, wordHash, words
    tokens = []
    # The context manager closes the file even if reading fails part-way.
    with open("test.txt", encoding="utf-8") as f:
        for sentence in f:
            for word in sentence.split():
                wordHash[word] = wordHash.get(word, 0) + 1
                wordNum += 1
                tokens.append(word)
    # Keep the whole file, not just its last line, so the training corpus
    # matches the vocabulary that was counted above.
    words = tokens



# Build the Huffman coding tree over the vocabulary.
def buildHFMTree():
    """Build a Huffman tree from the word counts stored in ``wordHash``.

    Words are sorted by descending count and become leaves ``0..V-1``;
    inner nodes are appended from index ``V`` upward in creation order.
    As a side effect every value in ``wordHash`` is replaced by the word's
    leaf index.

    Returns:
        (pos, parent): two lists of length ``2 * V``.
        ``parent[i]`` is the index of node i's parent (``None`` for the root
        and for unused slots). ``pos[i]`` is 0 for the lighter child of a
        merge and 1 for the heavier one (``None`` where there is no parent).
        With fewer than two words no merge happens and every entry is
        ``None``.
    """
    vocab = sorted(wordHash.items(), key=lambda item: item[1], reverse=True)
    vocabSize = len(vocab)
    length = vocabSize * 2
    # Slots of not-yet-created inner nodes hold the total count, which is
    # larger than any single leaf weight whenever there are >= 2 words, so
    # such a slot is never picked in preference to a real node.
    sentinel = sum(count for _, count in vocab)
    weight = [sentinel] * length
    parent = [None] * length
    pos = [None] * length
    for index, (word, count) in enumerate(vocab):
        wordHash[word] = index
        weight[index] = count

    # Nothing to merge: an empty or single-word vocabulary has no inner node.
    if vocabSize < 2:
        return pos, parent

    lp = vocabSize - 1   # lightest unmerged leaf; moves toward index 0
    rp = vocabSize       # oldest (lightest) unmerged inner node
    addp = vocabSize     # slot where the next inner node is created
    while True:
        if lp < 0:
            # All leaves are merged; only inner nodes remain.
            if rp + 1 == addp:
                break    # a single node is left: the root
            low, high = rp, rp + 1
            rp += 2
        elif weight[lp] < weight[rp]:
            # The lightest leaf is the lightest node overall.
            if lp >= 1 and weight[lp - 1] < weight[rp]:
                low, high = lp, lp - 1
                lp -= 2
            else:
                low, high = lp, rp
                lp -= 1
                rp += 1
        elif weight[rp + 1] > weight[lp]:
            # Lightest is an inner node, second lightest is the leaf.
            low, high = rp, lp
            lp -= 1
            rp += 1
        else:
            # The two lightest nodes are both inner nodes.
            low, high = rp, rp + 1
            rp += 2
        weight[addp] = weight[low] + weight[high]
        pos[low], pos[high] = 0, 1
        parent[low] = parent[high] = addp
        addp += 1
    return pos, parent

def sigmiod(n):
    """Return the logistic sigmoid ``1 / (1 + exp(-n))`` of *n*.

    Works on scalars and NumPy arrays. The value is computed as
    ``exp(-log(1 + exp(-n)))`` via ``np.logaddexp`` so that large positive
    or negative inputs saturate to 1.0 / 0.0 instead of producing
    ``inf / inf = nan``.
    """
    return np.exp(-np.logaddexp(0, -n))

def getHFMCode(word,pos,parent):
    """Print and return the Huffman code of *word*.

    Starting at the word's node index (looked up in ``wordHash``), the
    ``parent`` links are followed until a node without a parent is reached;
    the ``pos`` value of every visited node is collected on the way. The
    resulting list is therefore ordered from the leaf upward.
    """
    node = wordHash[word]
    bits = []
    while True:
        above = parent[node]
        if above is None:
            break
        bits.append(pos[node])
        node = above
    print("單詞'"+word+"'的哈弗曼編碼:"+str(bits))
    return bits

def updataParam(word,pos,parent,ansVec,projVec,paramVec):
    """Compute one word's gradient step along its Huffman path.

    Args:
        word: target word; its starting node index is read from ``wordHash``.
        pos: per-node branch code (0 or 1), as returned by buildHFMTree().
        parent: per-node parent index, ``None`` where there is no parent.
        ansVec: pre-computed scores, indexed by ``parent[node] - wordNum``.
        projVec: projection (context) vector of length ``vecSize``.
        paramVec: parameter matrix of shape ``(vecSize, wordNum - 1)``.

    Returns:
        (projChange, paramChange, ll): the update for the projection vector,
        the update for ``paramVec`` (same shape), both already scaled by the
        step size ``u``, and the accumulated log-likelihood of the path.
    """
    node = wordHash[word]
    ll = 0
    paramChange = np.zeros((vecSize, wordNum - 1))
    projChange = np.zeros((vecSize))
    while parent[node] is not None:
        # NOTE(review): the offset is wordNum, so this index can be negative
        # and then relies on NumPy's negative indexing - confirm intended.
        column = parent[node] - wordNum
        d = pos[node]
        # Evaluate the sigmoid once per node and reuse it below.
        prob = sigmiod(ansVec[column])
        try:
            ll += (1 - d) * math.log(prob) + d * math.log(1 - prob)
        except ValueError:
            # math.log raises ValueError when the sigmoid has saturated to
            # exactly 0 or 1; that term is left out of the reported value.
            pass
        m = 1 - d - prob
        paramChange[:, column] += m * projVec * u
        projChange += m * paramVec[:, column] * u
        node = parent[node]
    return projChange, paramChange, ll


def initVec():
    """Create the initial word vectors and inner-node parameters.

    Returns:
        (wordVec, paramVec): ``wordVec`` has shape ``(wordNum, vecSize)``
        with entries drawn uniformly from ``[-0.5, 0.5) / vecSize``;
        ``paramVec`` is an all-zero array of shape ``(vecSize, wordNum - 1)``.
    """
    # One vectorised expression replaces the element-by-element Python loop;
    # the values produced are the same.
    wordVec = (np.random.random((wordNum, vecSize)) - 0.5) / vecSize
    paramVec = np.zeros((vecSize, wordNum - 1))
    return wordVec, paramVec

def train(wordVec,paramVec,pos,parent):
    """Train the word vectors and inner-node parameters in place.

    Runs ``t + 1`` passes (k = 0..t) over the global ``words`` sequence. For
    every position the vectors of up to ``window`` words on each side are
    averaged into a projection vector, updataParam() turns that into
    updates, and the updates are accumulated over the whole pass before
    being added to ``wordVec`` and ``paramVec``.

    Args:
        wordVec: array of shape ``(wordNum, vecSize)``; modified in place.
        paramVec: array of shape ``(vecSize, wordNum - 1)``; modified in place.
        pos: branch codes from buildHFMTree().
        parent: parent indices from buildHFMTree().

    Returns:
        None. Every 100th pass the per-word log-likelihood is printed.
    """
    corpusLen = len(words)
    for k in range(t + 1):
        paramChange = np.zeros((vecSize, wordNum - 1))
        wordChange = np.zeros((wordNum, vecSize))
        for i in range(corpusLen):
            # Symmetric window: `window` words on each side, clipped at the
            # corpus ends. The upper bound is i + window + 1 because range()
            # excludes its stop value.
            first = max(0, i - window)
            last = min(corpusLen, i + window + 1)
            context = [wordHash[words[j]] for j in range(first, last) if j != i]
            if not context:
                # No neighbours (e.g. a one-word corpus): averaging would
                # divide by zero and spread NaN into every vector.
                continue
            projVec = np.zeros(vecSize)
            for row in context:
                projVec += wordVec[row]
            projVec = projVec / len(context)
            ansVec = projVec.dot(paramVec)
            projChange1, paramChange1, ll = updataParam(words[i], pos, parent, ansVec, projVec, paramVec)
            # Explicit loop so a word occurring twice in the window receives
            # the update twice (fancy-index += would apply it only once).
            for row in context:
                wordChange[row] += projChange1
            paramChange += paramChange1
            if k % 100 == 0:
                print("第"+str(k)+"輪訓練中,單詞"+words[i]+"的損失爲:"+str(ll))
        wordVec += wordChange
        paramVec += paramChange

# Script driver: runs the whole pipeline in order as soon as the file is
# executed. Each stage prints a progress message before and/or after it.

# Stage 1: read test.txt and count the words.
print("開始讀取單詞")
read_file()
print("讀取單詞結束")
# Stage 2: build the Huffman tree; wordHash now maps word -> leaf index.
print("開始構建哈弗曼樹")
pos,parent = buildHFMTree()
print("構建完成")
# Optional debugging aid: print the Huffman code of a single word.
#getHFMCode('interpretations',pos,parent)
# Stage 3: allocate the word vectors and the inner-node parameter matrix.
print("開始初始化單詞向量")
wordVec,paramVec = initVec()
print("單詞向量初始完成")
# Stage 4: train; wordVec and paramVec are updated in place.
print("準備訓練參數")
train(wordVec,paramVec,pos,parent)

語料:

In the near future the translation history will only be viewable when you log in to your account and it will be centrally managed in my activity record. This upgrade will clear the previous history so if you want the system to record certain translations for future review please be sure to save the translation results

執行結果:
在这里插入图片描述
在这里插入图片描述
不足
相對於原始碼,本實現尚未支援負採樣、多執行緒,以及以指數運算近似來降低運算量等功能。