Commit e2849028 authored by David Geisler's avatar David Geisler

small improvements

parent b569a6a4
......@@ -70,7 +70,132 @@ namespace utue::pe::tools::lsh {
* @brief Implements a MinHash algorithm to estimate the similarity of multiple Documents
*/
class MinHash {
public:
/**
* @brief Stores the result of one MinHash similarity calculation
*/
class Similarity {
public:
    /**
     * @brief Represents one row of the similarity report: a single
     * investigated Token together with its frequencies and score share
     */
    class ReportEntry {
    public:
        /** Investigated token */
        Token m_token;
        /** Frequency of the Token in the left Document */
        float m_leftFreq{0.0f};
        /** Frequency of the Token in the right Document */
        float m_rightFreq{0.0f};
        /** Frequency of the Token in the Dictionary */
        float m_dictFreq{0.0f};
        /** Calculated weight of the Token */
        float m_weight{0.0f};
        /** Fractional count of matches while thresholding */
        float m_matches{0.0f};
        /** Constructs an empty ReportEntry */
        ReportEntry();
    };
    /** The left Document */
    DocumentPtr m_left;
    /** The right Document */
    DocumentPtr m_right;
    /** Number of thresholds used to break the Token frequencies down to a boolean space */
    unsigned int m_thresholds{0};
    /** Random number generator seed */
    unsigned int m_seed{0};
    /** Sum of weights of all Token matches */
    float m_score{0.0f};
    /** Sum of all weights */
    float m_count{0.0f};
    /** Similarity score */
    float m_similarity{0.0f};
    /** Number of hashes/permutations */
    unsigned int m_hashes{0};
    /** Number of buckets per hash table */
    unsigned int m_buckets{0};
    /** Processing time in seconds */
    float m_seconds{0.0f};
    /** Report of all investigated Token matches */
    std::list<ReportEntry> m_report;
    /**
     * Creates a new Similarity object
     * @param left Left Document (taken by value; moved into the object)
     * @param right Right Document (taken by value; moved into the object)
     * @param thresholds Number of thresholds
     * @param seed Random number generator seed
     * @param hashes Number of hashes/permutations
     * @param buckets Number of buckets in hash tables
     */
    Similarity(DocumentPtr left, DocumentPtr right, const unsigned int& thresholds, const unsigned int& seed, const unsigned int& hashes, const unsigned int& buckets);
    /** Creates an empty Similarity object; all counters start at zero */
    Similarity();
    /**
     * Renders a human readable, tabular report of the calculation
     * @param top Maximum number of report entries to print
     * @param cw Column width used for the table layout
     * @return Formatted multi-line report string
     */
    [[nodiscard]] std::string toString(const unsigned int& top = 20, int cw = 10) const;
};
private:
/**
* Dictionary of all related Documents
* @see Dictionary
......@@ -89,6 +214,16 @@ namespace utue::pe::tools::lsh {
*/
std::vector<Token::HashSet> m_permutations;
/**
* Number of hashes/permutations
*/
unsigned int m_hashes;
/**
* Number of buckets in each hash table
*/
unsigned int m_buckets;
public:
/**
......@@ -102,14 +237,14 @@ namespace utue::pe::tools::lsh {
* Creates a new document
* @return document Empty document linked to the MinHash Dictionary and Scanner
*/
DocumentPtr createDocument();
[[nodiscard]] DocumentPtr createDocument();
/**
* Creates a new document
* @param text content of the document
* @return Document with initialized content linked to the MinHash Dictionary and Scanner
*/
DocumentPtr createDocument(const std::string& text);
[[nodiscard]] DocumentPtr createDocument(const std::string& text);
/**
* creates a new token scanner
......@@ -136,6 +271,12 @@ namespace utue::pe::tools::lsh {
*/
void add(const Token& token);
/**
* Calculates the similarity of two Documents
* @param similarity Read/Write container with two documents
*/
void similarity(Similarity& similarity);
/**
* Calculates the similarity of two Documents
* @param left The first Document
......@@ -144,7 +285,7 @@ namespace utue::pe::tools::lsh {
* @param seed Random number generator seed
* @return Similarity of two Documents
*/
float similarity(const DocumentPtr& left, const DocumentPtr& right, const unsigned int& thresholds = 10, const unsigned int& seed = 0);
[[nodiscard]] Similarity similarity(const DocumentPtr& left, const DocumentPtr& right, const unsigned int& thresholds = 10, const unsigned int& seed = 0);
/**
* Calculates the similarity of two strings
......@@ -154,7 +295,7 @@ namespace utue::pe::tools::lsh {
* @param seed Random number generator seed
* @return Similarity of two strings
*/
float similarity(const std::string& left, const std::string& right, const unsigned int& thresholds = 10, const unsigned int& seed = 0);
[[nodiscard]] Similarity similarity(const std::string& left, const std::string& right, const unsigned int& thresholds = 10, const unsigned int& seed = 0);
/**
* Add a Scanner to the MinHash
......@@ -162,13 +303,31 @@ namespace utue::pe::tools::lsh {
*/
void add(const ScannerPtr& scanner);
/**
* Returns the dictionary
* @return Dictionary
*/
[[nodiscard]] const DictionaryPtr& dictionary() const;
/**
* Returns the maximal bucket load
* @return Bucket load
*/
[[nodiscard]] unsigned long maxLoad() const;
/**
* Returns the average bucket load
* @return Bucket load
*/
[[nodiscard]] float avgLoad() const;
/**
* Creates a MinHash object
* @param hashes Number of different hash functions resulting in different permutations.
* @param buckets Number of buckets in the chained hash set of Tokens
* @return Pointer to the created MinHash object
*/
static std::shared_ptr<MinHash> create(const unsigned int& hashes = 1000, const unsigned int& buckets = 500);
[[nodiscard]] static std::shared_ptr<MinHash> create(const unsigned int& hashes = 1000, const unsigned int& buckets = 500);
};
/**
......
......@@ -55,9 +55,9 @@
#ifndef UTUE_PE_TOOLS_LSH_SCANNER_H
#define UTUE_PE_TOOLS_LSH_SCANNER_H
#include <utue/pe/tools/lsh/Token.h>
#include <functional>
#include <memory>
#include <utue/pe/tools/lsh/Token.h>
namespace utue::pe::tools::lsh {
......
......@@ -147,9 +147,9 @@ namespace utue::pe::tools::lsh {
float m_weight;
/**
* Pointer to the value of the Token
* Value of the Token
*/
char* m_value;
char m_value[32];
/**
* Size of the value
......@@ -164,12 +164,6 @@ namespace utue::pe::tools::lsh {
*/
Token();
/**
* Copy a token
* @param token to copy
*/
Token(const Token& token);
/**
* Create a full initialized Token
* @param type Type of the Token
......@@ -267,6 +261,8 @@ namespace utue::pe::tools::lsh {
* @return String showing the Token variables
*/
[[nodiscard]] std::string toString() const;
static std::string typeToString(const Type& type);
};
}
......
......@@ -168,9 +168,11 @@ namespace utue::pe::utils {
HashSet(const unsigned int& buckets = 100, const unsigned long& seed = 0) :
m_hash(),
m_equal(),
m_buckets(buckets),
m_buckets(),
m_active_buckets(),
m_seed(seed) { }
m_seed(seed) {
this->m_buckets.resize(buckets);
}
/**
* Inserts an element to the HashSet
......@@ -192,12 +194,24 @@ namespace utue::pe::utils {
return this->m_buckets[i].emplace_back(element);
}
/**
 * Checks whether an element equal to the given one is already stored.
 * @param element Element to look up
 * @return true if a stored element compares equal, false otherwise
 */
[[nodiscard]] bool has(const T& element) {
    // Only the bucket selected by the hash can contain a match.
    const unsigned long bucket = this->hash(element);
    for (auto& candidate : this->m_buckets[bucket])
        if (this->m_equal(candidate, element))
            return true;
    return false;
}
/**
* Returns a iterator container for fast bucket iteration.
* Only active buckets will be considered.
* @return Iterator container
*/
Iterator<FastBucketIterator> fastIterator() {
[[nodiscard]] Iterator<FastBucketIterator> fastIterator() {
return Iterator<FastBucketIterator>(this->m_active_buckets.begin(),this->m_active_buckets.end());
}
......@@ -206,7 +220,7 @@ namespace utue::pe::utils {
* All buckets will be considered, if active or not.
* @return Iterator container
*/
Iterator<SlowBucketIterator> slowIterator() {
[[nodiscard]] Iterator<SlowBucketIterator> slowIterator() {
return Iterator<SlowBucketIterator>(this->m_buckets.begin(),this->m_buckets.end());
}
......@@ -215,7 +229,7 @@ namespace utue::pe::utils {
* Only active buckets will be considered.
* @return Iterator container
*/
Iterator<FastConstBucketIterator> fastIterator() const {
[[nodiscard]] Iterator<FastConstBucketIterator> fastIterator() const {
return Iterator<FastConstBucketIterator>(this->m_active_buckets.begin(),this->m_active_buckets.end());
}
......@@ -224,10 +238,41 @@ namespace utue::pe::utils {
* All buckets will be considered, if active or not.
* @return Iterator container
*/
Iterator<SlowConstBucketIterator> slowIterator() const {
[[nodiscard]] Iterator<SlowConstBucketIterator> slowIterator() const {
return Iterator<SlowConstBucketIterator>(this->m_buckets.begin(),this->m_buckets.end());
}
/**
* Returns the maximal bucket load
* @return Bucket load
*/
/**
 * Returns the maximal bucket load.
 * @return Largest number of elements stored in any active bucket
 */
[[nodiscard]] unsigned long maxLoad() const {
    unsigned long heaviest = 0;
    // Only active buckets can hold elements, so scanning them suffices.
    for (const auto& bucket : this->m_active_buckets) {
        const unsigned long size = bucket->size();
        if (size > heaviest)
            heaviest = size;
    }
    return heaviest;
}
/**
* Returns the average bucket load
* @return Bucket load
*/
/**
 * Returns the average bucket load.
 * @return Mean number of elements per active bucket; 0 when no bucket
 *         has been activated yet
 */
[[nodiscard]] float avgLoad() const {
    // Guard the division: with no active buckets the original 0/0
    // expression would yield NaN.
    if (this->m_active_buckets.empty())
        return 0.0f;
    float total = 0.0f;
    for (const auto& bucket : this->m_active_buckets)
        total += static_cast<float>(bucket->size());
    return total / static_cast<float>(this->m_active_buckets.size());
}
protected:
/**
......
......@@ -124,7 +124,7 @@ namespace utue::pe::utils {
* Creates a time measurement relative to a given Tic
* @param tic Start timestamp
*/
Toc(const Tic& tic);
explicit Toc(const Tic& tic);
/**
* Creates a time measurement relative to the last Tic.
......
......@@ -84,6 +84,6 @@ namespace utue::pe::tools::lsh {
}
float Dictionary::frequency(const Token& token) const {
return float(this->m_tokens(token,0))/float(this->m_max);
return float(this->m_tokens(token,0))/float(this->m_count);
}
}
\ No newline at end of file
......@@ -58,10 +58,145 @@
#include <memory>
#include <cmath>
#include <random>
#include <utility>
#include <utue/pe/utils/TicToc.h>
#include <sstream>
#include <iomanip>
namespace utue::pe::tools::lsh {
MinHash::Similarity::ReportEntry::ReportEntry() : m_token(), m_leftFreq(0.0f), m_rightFreq(0.0f), m_dictFreq(0.0f), m_weight(0.0f), m_matches(0.0f) {};
/**
 * Creates a fully parameterized Similarity container. The score, count,
 * similarity and runtime fields start at zero; MinHash::similarity()
 * fills them in during the calculation.
 * @param left Left Document (taken by value and moved)
 * @param right Right Document (taken by value and moved)
 * @param thresholds Number of thresholds
 * @param seed Random number generator seed
 * @param hashes Number of hashes/permutations
 * @param buckets Number of buckets in hash tables
 */
MinHash::Similarity::Similarity(DocumentPtr left, DocumentPtr right, const unsigned int& thresholds, const unsigned int& seed, const unsigned int& hashes, const unsigned int& buckets) :
m_left(std::move(left)),
m_right(std::move(right)),
m_thresholds(thresholds),
m_seed(seed),
m_score(0.0f),
m_count(0.0f),
m_similarity(0.0f),
m_hashes(hashes),
m_buckets(buckets),
m_seconds(0.0f),
m_report() { }
/**
 * Creates an empty Similarity object: no documents attached and every
 * numeric field explicitly zero-initialized.
 */
MinHash::Similarity::Similarity() :
m_left(),
m_right(),
m_thresholds(0),
m_seed(0),
m_score(0.0f),
m_count(0.0f),
m_similarity(0.0f),
m_hashes(0),
m_buckets(0),
m_seconds(0.0f),
m_report() { }
/**
 * Renders a human readable report of this similarity calculation.
 *
 * The report lists the run parameters (hashes, buckets, thresholds,
 * seed, runtime, similarity) followed by a ten-column table of the
 * highest scoring Token matches, sorted by matches * weight descending.
 *
 * @param top Maximum number of report entries printed in the table
 * @param cw  Column width used for the table layout
 * @return Formatted multi-line report string
 */
std::string MinHash::Similarity::toString(const unsigned int& top, int cw) const {
    std::stringstream ss;

    // Work on a copy so sorting does not mutate this const object.
    std::list<ReportEntry> report(this->m_report);

    // Emits one horizontal separator row of the ten-column table.
    const auto separator = [&ss, cw]() {
        ss << " +";
        for (int column = 0; column < 10; column++)
            ss << std::setw(cw + 3) << std::setfill('-') << "+";
        ss << std::endl;
    };

    ss << " --------------- R E P O R T ---------------" << std::endl;
    ss << " Number of hashes: " << this->m_hashes << std::endl;
    ss << " Number of buckets: " << this->m_buckets << std::endl;
    ss << " Number of thresholds: " << this->m_thresholds << std::endl;
    ss << " Seed: " << this->m_seed << std::endl;
    ss << " Runtime: " << this->m_seconds << " seconds" << std::endl;
    ss << " Similarity: " << this->m_similarity << std::endl;

    separator();
    ss << " | " << std::setw(cw*4+9) << std::setfill(' ') << "Token"
       << " | " << std::setw(cw) << std::setfill(' ') << "Left"
       << " | " << std::setw(cw) << std::setfill(' ') << "Right"
       << " | " << std::setw(cw) << std::setfill(' ') << "Dictionary"
       << " | " << std::setw(cw*3+6) << std::setfill(' ') << "MinHash"
       << " | " << std::endl;
    separator();
    ss << " | " << std::setw(cw) << std::setfill(' ') << "Type"
       << " | " << std::setw(cw) << std::setfill(' ') << "Length"
       << " | " << std::setw(cw) << std::setfill(' ') << "Value"
       << " | " << std::setw(cw) << std::setfill(' ') << "Weight"
       << " | " << std::setw(cw) << std::setfill(' ') << "Frequency"
       << " | " << std::setw(cw) << std::setfill(' ') << "Frequency"
       << " | " << std::setw(cw) << std::setfill(' ') << "Frequency"
       << " | " << std::setw(cw) << std::setfill(' ') << "Matches"
       << " | " << std::setw(cw) << std::setfill(' ') << "Weight"
       << " | " << std::setw(cw) << std::setfill(' ') << "Score"
       << " | " << std::endl;
    separator();

    // Highest combined contribution (matches * weight) first.
    report.sort(
        [](const ReportEntry& left, const ReportEntry& right) -> bool {
            return (left.m_matches*left.m_weight) > (right.m_matches*right.m_weight);
        });

    unsigned int printed = 0;
    for (const auto& entry : report) {
        printed++;
        // NOTE(review): the weight/score columns divide by m_count —
        // presumably non-zero after a run; confirm for empty reports.
        ss << " | " << std::setw(cw+0) << std::setfill(' ') << Token::typeToString(entry.m_token.type())
           << " | " << std::setw(cw+0) << std::setfill(' ') << entry.m_token.length()
           << " | " << std::setw(cw+0) << std::setfill(' ') << ("\"" + std::string(entry.m_token.value(),entry.m_token.length()) + "\"")
           << " | " << std::setw(cw+0) << std::setfill(' ') << std::fixed << std::setprecision(2) << entry.m_token.weight()
           << " | " << std::setw(cw-1) << std::setfill(' ') << std::fixed << std::setprecision(2) << 100.0f*entry.m_leftFreq << "%"
           << " | " << std::setw(cw-1) << std::setfill(' ') << std::fixed << std::setprecision(2) << 100.0f*entry.m_rightFreq << "%"
           << " | " << std::setw(cw-1) << std::setfill(' ') << std::fixed << std::setprecision(2) << 100.0f*entry.m_dictFreq << "%"
           << " | " << std::setw(cw-1) << std::setfill(' ') << std::fixed << std::setprecision(2) << 100.0f*entry.m_matches << "%"
           << " | " << std::setw(cw-1) << std::setfill(' ') << std::fixed << std::setprecision(2) << 100.0f*entry.m_weight/this->m_count << "%"
           << " | " << std::setw(cw-1) << std::setfill(' ') << std::fixed << std::setprecision(2) << 100.0f*entry.m_matches*entry.m_weight/this->m_count << "%"
           << " | " << std::endl;
        if(printed == top)
            break;
    }
    separator();
    return ss.str();
}
class MinHashDictionary : public Dictionary {
private:
MinHash* m_instance;
......@@ -94,9 +229,11 @@ namespace utue::pe::tools::lsh {
MinHash::MinHash(const unsigned int& hashes, const unsigned int& buckets) :
m_dictionary(new MinHashDictionary(this)),
m_scanner(new MinHashScanner()),
m_permutations() {
m_permutations(),
m_hashes(hashes),
m_buckets(buckets) {
for(unsigned int i = 0; i < hashes; i++) {
this->m_permutations.emplace_back(Token::HashSet(buckets,i));
this->m_permutations.emplace_back(buckets,i);
}
}
......@@ -117,75 +254,96 @@ namespace utue::pe::tools::lsh {
permutation.insert(token);
}
float MinHash::similarity(const DocumentPtr& left, const DocumentPtr& right, const unsigned int& thresholds, const unsigned int& seed) {
float leftFreq;
float rightFreq;
float dictFreq;
void MinHash::similarity(Similarity& similarity) {
Similarity::ReportEntry entry;
utils::Tic tic;
// Hyperplane to threshold the frequencies
float thresh;
float weight;
float score;
float count;
utils::Random::seed(seed);
bool done;
count = 0.0f;
score = 0.0f;
// cleanup report
similarity.m_report.clear();
// set seed for random number generator
utils::Random::seed(similarity.m_seed);
similarity.m_count = 0.0f;
similarity.m_score = 0.0f;
similarity.m_hashes = this->m_hashes;
similarity.m_buckets = this->m_buckets;
// Iterate over permutations
for(const auto& permutation : this->m_permutations) {
// skip if permutation list is empty
done = false;
// Iterate over buckets
for(const auto& bucket : permutation.slowIterator()) { //Its important to use the slow iterator here!
//The fast iterator almost depicts the insertion
//order of the Tokens. The permutation is therefore
//weak, and earlier Tokens are higher weighted.
// iterate over tokens in permutation list
//The fast iterator almost depicts the insertion
//order of the Tokens. The permutation is therefore
//weak, and earlier Tokens are higher weighted.
// Iterate over tokens in permutation list