作業:編寫一個程式,給檔案生成simhash指紋。可以對詞使用任意合理的雜湊函數。使用該程式對計算機上的重複檔案進行檢測,得出檢測的準確率。檢測的準確率隨著指紋大小有什麼變化?
1、文字分詞,得到關鍵詞:權重(feature:weight)
對文字進行關鍵詞抽取(分詞和計算權重),抽出權重最高的前n(關鍵詞和權重)對,可利用jieba.analyse.extract_tags()實現,即一個文字得到一個長度為n(feature:weight)的集合。
2、hash
對獲取的每個詞(feature)進行普通的雜湊操作,計算其hash值,得到一個固定位數(例如64位)的二進位制串,從而得到(hash:weight)的集合。
3、加權
在獲取的hash值的基礎上,根據對應的weight值進行加權,W=hash*weight。即hash為1則和weight正相乘,為0則和weight負相乘。例如一個詞經過hash後得到(010111:5)經過步驟(3)之後可以得到列表[-5,5,-5,5,5,5]。
4、合併
將上述得到的各個向量的加權結果進行求和,變成只有一個序列串。如[-5,5,-5,5,5,5]、[-3,-3,-3,3,-3,3]、[1,-1,-1,1,1,1]進行列向累加得到[-7,1,-9,9,3,9],這樣,我們對一個檔案得到一個長度為64的列表。
5、降維
對於得到的n-bit簽名的累加結果的每個值進行判斷,大於0則置為1, 否則置為0,從而得到該語句的simhash值。例如,[-7,1,-9,9,3,9]得到 010111,這樣,我們就得到一個檔案的 simhash值。最後根據不同語句的simhash值的漢明距離來判斷相似度。
# -*- coding:utf-8 -*-
import jieba
import jieba.analyse
import numpy as np
class simhash:
    """Simhash fingerprint of a Chinese text.

    The fingerprint is stored as a binary string (e.g. '010111...'); near-
    duplicate texts produce fingerprints with a small Hamming distance.
    """

    def __init__(self, content):
        # Compute and store the fingerprint for the given text.
        self.hash = self.simhash(content)

    def __str__(self):
        return str(self.hash)

    def simhash(self, content):
        """Return the simhash fingerprint of *content* as a '0'/'1' string.

        Pipeline: segment with jieba, take the top-10 TF-IDF keywords with
        weights, hash each keyword to 64 bits, add +weight for a 1-bit and
        -weight for a 0-bit, then reduce the column sums to a bit string.
        Returns '00' when no keywords could be extracted.
        """
        seg = jieba.cut(content)
        # jieba TF-IDF keyword extraction: top 10 (feature, weight) pairs.
        keyWords = jieba.analyse.extract_tags("|".join(seg), topK=10, withWeight=True, allowPOS=())
        keyList = []
        for feature, weight in keyWords:
            # NOTE(review): int() truncation — TF-IDF weights below 1.0
            # collapse to 0 and contribute nothing to the fingerprint.
            weight = int(weight)
            # Plain 64-bit hash of the keyword, as a binary string.
            binstr = self.string_hash(feature)
            # Bit '1' contributes +weight, bit '0' contributes -weight.
            row = [weight if c == '1' else -weight for c in binstr]
            keyList.append(row)
        # BUG FIX: test for "no keywords" BEFORE summing — the original
        # called np.sum on an empty array first and checked afterwards.
        if not keyList:
            return '00'
        listSum = np.sum(np.array(keyList), axis=0)
        # Reduce: positive column sum -> '1', otherwise '0'.
        return ''.join('1' if i > 0 else '0' for i in listSum)

    def hamming_distance(self, other):
        """Return the number of bit positions where the fingerprints differ."""
        n = int(self.hash, 2) ^ int(other.hash, 2)
        count = 0
        while n:
            n &= n - 1  # Kernighan trick: clear the lowest set bit
            count += 1
        return count

    def similarity(self, other):
        """Return similarity in [0, 1]: the fraction of matching bits.

        BUG FIX: the original read the binary fingerprint string as a
        *decimal* float (float('010111') == 10111.0) and divided the two
        values, which is meaningless and can divide by zero. Similarity is
        now derived from the Hamming distance.
        """
        bits = max(len(self.hash), len(other.hash))
        if bits == 0:
            return 1.0
        return 1.0 - self.hamming_distance(other) / bits

    def string_hash(self, source):
        """Hash *source* to a 64-character '0'/'1' string (0 for empty input).

        A variable-length variant of Python's classic string hash
        (x = x*1000003 ^ ord(c)), masked to 128 bits, then truncated to the
        low 64 bits.
        """
        if source == "":
            return 0
        x = ord(source[0]) << 7
        m = 1000003
        mask = 2 ** 128 - 1
        for c in source:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(source)
        if x == -1:
            x = -2  # mirror CPython's convention that a hash never equals -1
        x = bin(x).replace('0b', '').zfill(64)[-64:]
        return str(x)
if __name__ == '__main__':
    # Fingerprint two similar Chinese sentences and report their distance.
    first = simhash('我想洗照片')
    second = simhash('可以洗一張照片嗎')
    print("海明距離:", first.hamming_distance(second))
    print("文字相似度:", first.similarity(second))
class simhash:
    """Simhash fingerprint over a token sequence, stored as an integer.

    *hashbits* controls the fingerprint width; an empty token sequence
    yields the all-ones fingerprint (every column sum is 0, and 0 >= 0).
    """

    def __init__(self, tokens='', hashbits=128):
        self.hashbits = hashbits
        self.hash = self.simhash(tokens)

    def __str__(self):
        return str(self.hash)

    def simhash(self, tokens):
        """Return the integer simhash fingerprint of *tokens*."""
        v = [0] * self.hashbits
        # t is the plain (non-sim) hash of each token.
        for t in (self._string_hash(x) for x in tokens):
            for i in range(self.hashbits):
                # +1 when bit i of the token hash is set, otherwise -1.
                if t & (1 << i):
                    v[i] += 1
                else:
                    v[i] -= 1
        # Reduce: set fingerprint bit i when column sum v[i] >= 0.
        fingerprint = 0
        for i in range(self.hashbits):
            if v[i] >= 0:
                fingerprint += 1 << i
        return fingerprint

    def hamming_distance(self, other):
        """Return the number of differing bits between the fingerprints."""
        x = (self.hash ^ other.hash) & ((1 << self.hashbits) - 1)
        tot = 0
        while x:
            tot += 1
            x &= x - 1  # clear the lowest set bit
        return tot

    def similarity(self, other):
        """Return the smaller fingerprint divided by the larger one.

        BUG FIX: the original raised ZeroDivisionError when both
        fingerprints were 0; identical fingerprints now return 1.0
        directly (same value the division produced for equal non-zero
        hashes).
        """
        a = float(self.hash)
        b = float(other.hash)
        if a == b:
            return 1.0
        if a > b:
            return b / a
        return a / b

    def _string_hash(self, source):
        """Variable-width variant of Python's classic string hash (0 for '')."""
        if source == "":
            return 0
        x = ord(source[0]) << 7
        m = 1000003
        mask = 2 ** self.hashbits - 1
        for c in source:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(source)
        if x == -1:
            x = -2  # hash convention: never return -1
        return x
if __name__ == '__main__':
    # Three samples: two near-duplicates and one shorter variant.
    samples = [
        'This is a test string for testing',
        'This is a test string for testing also',
        'This is a test',
    ]
    hash1, hash2, hash3 = (simhash(text.split()) for text in samples)
    print(hash1, hash2, hash3)
    for other in (hash2, hash3):
        print(hash1.hamming_distance(other), "\t", hash1.similarity(other))
# -*- coding:utf-8 -*-
import jieba
import jieba.analyse
import numpy as np
import re
# Paths of the two sample documents compared for near-duplication.
txt1 = r'./test1.txt'
txt2 = r'./test2.txt'
class simhash:
    """Simhash fingerprint of a text, stored as a 32-bit binary string.

    Variant of the 64-bit version above: the fingerprint width is set by
    the slice in string_hash (change .zfill(32)[-32:] to resize it).
    """

    def __init__(self, content):
        self.hash = self.simhash(content)

    def __str__(self):
        return str(self.hash)

    def simhash(self, content):
        """Return the simhash fingerprint of *content* as a '0'/'1' string.

        Segments with jieba, extracts the top-10 TF-IDF keywords with
        weights, hashes each keyword, accumulates +/-weight per bit and
        reduces to a bit string. Returns '00' when no keywords are found.
        """
        count = 0
        seg = jieba.cut(content)
        # jieba TF-IDF keyword extraction: top 10 (feature, weight) pairs.
        keyWords = jieba.analyse.extract_tags("|".join(seg), topK=10, withWeight=True, allowPOS=())
        keyList = []
        for feature, weight in keyWords:
            # Scale the TF-IDF weight by 10 so fractional weights survive
            # the int() truncation.
            weight = int(weight * 10)
            # Plain hash of the keyword, as a binary string.
            binstr = self.string_hash(feature)
            # Report the fingerprint size once per document.
            if count == 0:
                print("指紋大小為:", len(binstr))
            count += 1
            # Bit '1' contributes +weight, bit '0' contributes -weight.
            temp = [weight if c == '1' else -weight for c in binstr]
            keyList.append(temp)
        # BUG FIX: bail out BEFORE np.sum when no keywords were extracted
        # (the original summed an empty array first, then checked).
        if not keyList:
            return '00'
        # Column-wise sum of all weighted keyword vectors.
        listSum = np.sum(np.array(keyList), axis=0)
        # Reduce: positive column sum -> '1', otherwise '0'.
        return ''.join('1' if i > 0 else '0' for i in listSum)

    def hamming_distance(self, other):
        """Return the number of bit positions where the fingerprints differ."""
        n = int(self.hash, 2) ^ int(other.hash, 2)
        i = 0
        while n:
            n &= n - 1  # clear the lowest set bit
            i += 1
        return i

    def similarity(self, other):
        """Return similarity in [0, 1]: the fraction of matching bits.

        BUG FIX: the original interpreted the binary fingerprint string as
        a *decimal* float and divided the two values — meaningless, and a
        ZeroDivisionError for an all-zero fingerprint. Similarity is now
        derived from the Hamming distance; the debug print was removed.
        """
        bits = max(len(self.hash), len(other.hash))
        if bits == 0:
            return 1.0
        return 1.0 - self.hamming_distance(other) / bits

    def string_hash(self, source):
        """Hash *source* to a 32-character '0'/'1' string (0 for empty input).

        A variable-length variant of Python's classic string hash
        (x = x*1000003 ^ ord(c)), masked to 128 bits then truncated to the
        low 32 bits. Adjust .zfill(32)[-32:] to change the fingerprint size.
        """
        if source == "":
            return 0
        # Seed with the first character shifted left 7 bits.
        x = ord(source[0]) << 7
        m = 1000003
        mask = 2 ** 128 - 1
        # Fold every character of the keyword into the hash.
        for c in source:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(source)
        if x == -1:
            x = -2  # hash convention: never return -1
        x = bin(x).replace('0b', '').zfill(32)[-32:]
        return str(x)
def _read_clean_text(path):
    """Read a GBK-encoded file and return its cleaned, segmented text.

    Strips non-word characters, segments with jieba, joins the tokens with
    no separator, then removes any remaining punctuation from *punc*.
    NOTE(review): encoding='gbk' is hard-coded — confirm the sample files
    really are GBK.
    """
    punc = './ <>_ - - = ", 。,?!「」:‘’@#¥% … &×()——+【】{};;● &~| \s:'
    with open(path, 'r', encoding='gbk') as f:
        raw = f.read()
    # Drop everything that is not a word character before segmenting.
    text = re.sub(r'[^\w]+', '', raw)
    joined = ''.join(jieba.cut(text))
    return re.sub(r"[{}]+".format(punc), "", joined)


def txt_line(txt1, txt2):
    """Fingerprint two files and print their Hamming distance and similarity.

    DEDUP: the original duplicated the read/clean/segment pass verbatim for
    each file; both now go through _read_clean_text().
    """
    line1 = _read_clean_text(txt1)
    line2 = _read_clean_text(txt2)
    hash1 = simhash(line1)
    hash2 = simhash(line2)
    print("海明距離:", hash1.hamming_distance(hash2))
    print("文字相似度:", hash1.similarity(hash2))
if __name__ == '__main__':
    # Compare the two sample files and report distance and similarity.
    txt_line(txt1, txt2)
simHash介紹及python實現
python使用simhash實現文字相似性對比(全程式碼展示)
[轉]檔案去重演算法:SimHash和MinHash
淺談simhash及其python實現