
java simhash是什么?让我们一起来了解一下吧!
java simhash是java程序中的一种算法。Simhash算法产生与2002年,设计非常美妙,它输入是一个向量,得出的结果是一个F位的签名值。

Simhash和一般的hash算法不同,它具有两个关键的特点:
1.一个文档的指纹是所有属性的某种hash;
2.相似文档的hash应该是相似的;
simhash 算法如下:1,将一个 f 维的向量 V 初始化为 0 ; f 位的二进制数 S 初始化为 0 ;2,对每一个特征:用传统的 hash 算法对该特征产生一个 f 位的签名 b 。对 i=1 到 f :如果b 的第 i 位为 1 ,则 V 的第 i 个元素加上该特征的权重;否则,V 的第 i 个元素减去该特征的权重。 3,如果 V 的第 i 个元素大于 0 ,则 S 的第 i 位为 1 ,否则为 0 ;4,输出 S 作为签名。
simhash 算法代码:
package com.xxxx.checkandbigdataquery.utils;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.CharBuffer;
import java.util.Set;
/**
* a basic SimHash implementation
*
*
*/
public class SimHash {
public static final int HASH_SIZE = 64;
public static final long HASH_RANGE = 2 ^ HASH_SIZE;
public static MurmurHash hasher = new MurmurHash();
/**
* use short cuts to obtains a speed optimized simhash calculation
*
* @param s
* input string
* @return 64 bit simhash of input string
*/
private static final int FIXED_CGRAM_LENGTH = 4;
public static long computeOptimizedSimHashForString(String s) {
return computeOptimizedSimHashForString(CharBuffer.wrap(s));
}
public static long computeOptimizedSimHashForString(CharBuffer s) {
LongSet shingles = new LongOpenHashSet(Math.min(s.length(), 100000));
int length = s.length();
long timeStart = System.currentTimeMillis();
for (int i = 0; i < length - FIXED_CGRAM_LENGTH + 1; i++) {
// extract an ngram
long shingle = s.charAt(i);
shingle <<= 16;
shingle |= s.charAt(i + 1);
shingle <<= 16;
shingle |= s.charAt(i + 2);
shingle <> 56);
longAsBytes[1] = (byte) (shingle >> 48);
longAsBytes[2] = (byte) (shingle >> 40);
longAsBytes[3] = (byte) (shingle >> 32);
longAsBytes[4] = (byte) (shingle >> 24);
longAsBytes[5] = (byte) (shingle >> 16);
longAsBytes[6] = (byte) (shingle >> 8);
longAsBytes[7] = (byte) (shingle);
long longHash = FPGenerator.std64.fp(longAsBytes, 0, 8);
for (int i = 0; i < HASH_SIZE; ++i) {
boolean bitSet = ((longHash >> i) & 1L) == 1L;
v[i] += (bitSet) ? 1 : -1;
}
}
long simhash = 0;
for (int i = 0; i < HASH_SIZE; ++i) {
if (v[i] > 0) {
simhash |= (1L << i);
}
}
return simhash;
}
public static long computeSimHashFromString(Set shingles) {
int v[] = new int[HASH_SIZE];
// compute a set of shingles
for (String shingle : shingles) {
byte[] bytes = shingle.getBytes();
long longHash = FPGenerator.std64.fp(bytes, 0, bytes.length);
// long hash1 = hasher.hash(bytes, bytes.length, 0);
// long hash2 = hasher.hash(bytes, bytes.length, (int)hash1);
// long longHash = (hash1 << 32) | hash2;
for (int i = 0; i < HASH_SIZE; ++i) {
boolean bitSet = ((longHash >> i) & 1L) == 1L;
v[i] += (bitSet) ? 1 : -1;
}
}
long simhash = 0;
for (int i = 0; i < HASH_SIZE; ++i) {
if (v[i] > 0) {
simhash |= (1L << i);
}
}
return simhash;
}
public static int hammingDistance(long hash1, long hash2) {
long bits = hash1 ^ hash2;
int count = 0;
while (bits != 0) {
bits &= bits - 1;
++count;
}
return count;
}
public static long rotate(long hashValue) {
return (hashValue << 1) | (hashValue >>> -1);
}
public static void main(String[] args) {
try {
// File file1 = new File("/Users/rana/academia.edu_01.html");
// File file2 = new File("/Users/rana/academia.edu_02.html");
File file1 = new File(args[0]);
File file2 = new File(args[1]);
byte data1[] = new byte[(int) file1.length()];
byte data2[] = new byte[(int) file2.length()];
FileInputStream stream1 = new FileInputStream(file1);
FileInputStream stream2 = new FileInputStream(file2);
stream1.read(data1);
stream2.read(data2);
String string1 = new String(data1);
String string2 = new String(data2);
long timeStart = System.currentTimeMillis();
long simhash1 = computeSimHashFromString(Shingle.shingles(string1));
long timeEnd = System.currentTimeMillis();
System.out.println("Old Calc for Document A Took:"
+ (timeEnd - timeStart));
timeStart = System.currentTimeMillis();
long simhash2 = computeSimHashFromString(Shingle.shingles(string2));
timeEnd = System.currentTimeMillis();
System.out.println("Old Calc for Document B Took:"
+ (timeEnd - timeStart));
timeStart = System.currentTimeMillis();
long simhash3 = computeOptimizedSimHashForString(string1);
timeEnd = System.currentTimeMillis();
System.out.println("New Calc for Document A Took:"
+ (timeEnd - timeStart));
timeStart = System.currentTimeMillis();
long simhash4 = computeOptimizedSimHashForString(string2);
timeEnd = System.currentTimeMillis();
System.out.println("New Calc for Document B Took:"
+ (timeEnd - timeStart));
int hammingDistance = hammingDistance(simhash1, simhash2);
int hammingDistance2 = hammingDistance(simhash3, simhash4);
System.out.println("hammingdistance Doc (A) to Doc(B) OldWay:"
+ hammingDistance);
System.out.println("hammingdistance Doc (A) to Doc(B) NewWay:"
+ hammingDistance2);
} catch (IOException e) {
e.printStackTrace();
}
}
}以上就是小编今天的分享了,希望可以帮助到大家。
