simhash算法的python实现_木木彡_新浪博客

  • A+
所属分类:其他杂项
本文信息:本文由方法SEO顾问发表于 2016-06-14 16:34:25,共 1863 字,转载请注明:simhash算法的python实现_木木彡_新浪博客_【方法SEO顾问】

参考:

http://grunt1223.iteye.com/blog/964564
http://www.cnblogs.com/coder2012/p/3293288.html

【code】

#!/usr/bin/python
# coding=utf-8
class simhash:
    """SimHash fingerprint of a sequence of string tokens.

    Each token is hashed to a ``hashbits``-wide integer. For every bit
    position a counter is incremented when a token hash has that bit set and
    decremented otherwise; the fingerprint has a 1 in every position whose
    counter ended up non-negative.

    Attributes:
        hashbits: Width of the fingerprint in bits.
        hash: The integer fingerprint computed from the tokens.
    """

    def __init__(self, tokens='', hashbits=128):
        """Compute and store the fingerprint of *tokens*.

        Args:
            tokens: Iterable of strings. Passing a plain string iterates
                over its characters.
            hashbits: Number of bits in the fingerprint.
        """
        self.hashbits = hashbits
        self.hash = self.simhash(tokens)

    def __str__(self):
        """Return the fingerprint as a decimal string."""
        return str(self.hash)

    def simhash(self, tokens):
        """Return the integer fingerprint for *tokens*.

        With no tokens every counter stays at 0, which counts as
        non-negative, so the result has all ``hashbits`` bits set.
        """
        weights = [0] * self.hashbits
        for token_hash in map(self._string_hash, tokens):
            for i in range(self.hashbits):
                # Vote +1 when bit i of this token's hash is set, -1 otherwise.
                if token_hash & (1 << i):
                    weights[i] += 1
                else:
                    weights[i] -= 1

        fingerprint = 0
        for i, weight in enumerate(weights):
            if weight >= 0:
                fingerprint |= 1 << i
        return fingerprint

    def hamming_distance(self, other):
        """Return the number of differing bits between the two fingerprints.

        Only the low ``self.hashbits`` bits are compared.
        """
        diff = (self.hash ^ other.hash) & ((1 << self.hashbits) - 1)
        return bin(diff).count("1")

    def similarity(self, other):
        """Return the ratio smaller/larger of the two fingerprint values.

        The fingerprints are compared as floating-point numbers, so the
        result lies in [0.0, 1.0]. Two equal fingerprints (including the
        case where both are 0) give 1.0.

        NOTE(review): this is a ratio of numeric values, not a bit-level
        measure; use ``hamming_distance`` for the latter.
        """
        a = float(self.hash)
        b = float(other.hash)
        if a == b:
            # Also covers a == b == 0, which would otherwise divide by zero.
            return 1.0
        return min(a, b) / max(a, b)

    def _string_hash(self, source):
        """Hash *source* to an integer using a multiply-and-xor scheme.

        Every intermediate value is masked to ``self.hashbits`` bits; the
        length of *source* is XOR-ed in at the end. The empty string hashes
        to 0.
        """
        if source == "":
            return 0

        multiplier = 1000003
        mask = 2 ** self.hashbits - 1
        x = ord(source[0]) << 7
        for c in source:
            x = ((x * multiplier) ^ ord(c)) & mask
        # x is non-negative after masking, so no -1 sentinel fix-up is needed.
        x ^= len(source)
        return x
           
 
# Demo: fingerprint three sentences and compare the first against the others.
if __name__ == '__main__':
    s = 'This is a test string for testing'
    hash1 = simhash(s.split())

    s = 'This is a test string for testing also'
    hash2 = simhash(s.split())

    s = 'nai nai ge xiong cao'
    hash3 = simhash(s.split())

    # Each line shows the Hamming distance followed by the value-ratio similarity.
    print(hash1.hamming_distance(hash2), "   ", hash1.similarity(hash2))
    print(hash1.hamming_distance(hash3), "   ", hash1.similarity(hash3))

发表评论

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

目前评论:1   其中:访客  0   博主  0

    • avatar 风继续吹

      这代码看着好方