Commit 7f5343fe authored by David Geisler's avatar David Geisler

rehash implemented

parent c32b8a2b
......@@ -57,6 +57,8 @@
#include <utue/pe/utils/HashMap.h>
#define MAX_TOKEN_SIZE 12
namespace utue::pe::tools::lsh {
/**
......@@ -142,7 +144,7 @@ namespace utue::pe::tools::lsh {
/**
* Value of the Token
*/
char m_value[32];
char m_value[MAX_TOKEN_SIZE];
/**
* Size of the value
......
Subproject commit 6f87d658c8d5a348f61eba50165858b03fbe60dd
Subproject commit 63bd4c7638fe1743ba93c493fbba4ec5f10b33e4
......@@ -263,8 +263,6 @@ namespace utue::pe::tools::lsh {
// Hyperplane to threshold the frequencies
float thresh;
bool done;
// cleanup report
similarity.m_report.clear();
......@@ -278,58 +276,49 @@ namespace utue::pe::tools::lsh {
// Iterate over permutations
for(const auto& permutation : this->m_permutations) {
done = false;
// Iterate over buckets
for(const auto& bucket : permutation.slowIterator()) { //Its important to use the slow iterator here!
//The fast iterator almost depicts the insertion
//order of the Tokens. The permutation is therefore
//weak, and earlier Tokens are higher weighted.
// Iterate over tokens in permutation list
for (const auto& token : bucket) {
//todo: In worst case both documents are
// disjoint and we run through all
// tokens
entry.m_token = token;
// Get frequency of the token
entry.m_leftFreq = similarity.m_left->frequency(entry.m_token);
entry.m_rightFreq = similarity.m_right->frequency(entry.m_token);
entry.m_dictFreq = this->m_dictionary->frequency(entry.m_token);
if(entry.m_leftFreq == 0.0f && entry.m_rightFreq == 0.0f) {
// Token does not occur in both docs
continue; // Process next token
}
// Word appears at least in one of both documents
// A low overall dictionary frequency leads to the conclusion that the Token is more expressive
// for those documents. It's therefore weighted higher by the division by the low frequency.
entry.m_weight = entry.m_token.weight();//entry.m_dictFreq;
// Process random hyper planes to break down the frequencies to boolean values
entry.m_matches = 0.0f;
for(unsigned int j = 0; j < similarity.m_thresholds; j++) {
similarity.m_count += entry.m_weight/float(similarity.m_thresholds);
// Get random threshold to cut frequency domain down to boolean domain
thresh = utils::Random::Uniform::range(0.0f, fmaxf(entry.m_leftFreq, entry.m_rightFreq));
// Count a match if both frequencies are on the same side of the plane/threshold
if ((entry.m_leftFreq > thresh) == (entry.m_rightFreq > thresh)) {
entry.m_matches += 1.0f/float(similarity.m_thresholds);
similarity.m_score += entry.m_weight / float(similarity.m_thresholds);
}
//It's important to use the slow iterator here!
//The fast iterator almost depicts the insertion
//order of the Tokens. The permutation is therefore
//weak, and earlier Tokens are higher weighted.
// Iterate over tokens in permutation list
permutation.slowBreakableForeach([&] (const Token& token) -> bool {
entry.m_token = token;
// Get frequency of the token
entry.m_leftFreq = similarity.m_left->frequency(entry.m_token);
entry.m_rightFreq = similarity.m_right->frequency(entry.m_token);
if(entry.m_leftFreq == 0.0f && entry.m_rightFreq == 0.0f) {
// Token does not occur in both docs
return true; // Process next token
}
// Word appears at least in one of both documents
entry.m_dictFreq = this->m_dictionary->frequency(entry.m_token);
// A low overall dictionary frequency leads to the conclusion that the Token is more expressive
// for those documents. It's therefore weighted higher by the division by the low frequency.
entry.m_weight = entry.m_token.weight();//entry.m_dictFreq;
// Process random hyper planes to break down the frequencies to boolean values
entry.m_matches = 0.0f;
for(unsigned int j = 0; j < similarity.m_thresholds; j++) {
similarity.m_count += entry.m_weight/float(similarity.m_thresholds);
// Get random threshold to cut frequency domain down to boolean domain
thresh = utils::Random::Uniform::range(0.0f, fmaxf(entry.m_leftFreq, entry.m_rightFreq));
// Count a match if both frequencies are on the same side of the plane/threshold
if ((entry.m_leftFreq > thresh) == (entry.m_rightFreq > thresh)) {
entry.m_matches += 1.0f/float(similarity.m_thresholds);
similarity.m_score += entry.m_weight / float(similarity.m_thresholds);
}
}
// Match found -> done!
similarity.m_report.emplace_back(entry);
done = true;
break;
};
if(done)
break;
}
// Match found -> done!
similarity.m_report.emplace_back(entry);
return false;
});
}
if(similarity.m_count > 0.0f)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment