Commit e2849028 authored by David Geisler's avatar David Geisler

small improvements

parent b569a6a4
...@@ -70,7 +70,132 @@ namespace utue::pe::tools::lsh { ...@@ -70,7 +70,132 @@ namespace utue::pe::tools::lsh {
* @brief Implements a MinHash algorithm to estimate the similarity of multiple Documents * @brief Implements a MinHash algorithm to estimate the similarity of multiple Documents
*/ */
class MinHash { class MinHash {
public:
/**
* @brief Stores the result of one MinHash similarity calculation
*/
class Similarity {
public:
/**
* @brief Represents a report entry
*
* One entry holds the figures collected for a single Token.
*/
class ReportEntry {
public:
/**
* Investigated Token
*/
Token m_token;
/**
* Frequency of the Token in the left Document
*/
float m_leftFreq;
/**
* Frequency of the Token in the right Document
*/
float m_rightFreq;
/**
* Frequency of the Token in the Dictionary
*/
float m_dictFreq;
/**
* Calculated weight of the Token
*/
float m_weight;
/**
* Fractional count of matches while thresholding
*/
float m_matches;
/**
* Constructs an empty ReportEntry
*/
ReportEntry();
};
/**
* The left Document
*/
DocumentPtr m_left;
/**
* The right Document
*/
DocumentPtr m_right;
/**
* Number of thresholds used to break down the Token frequencies into a boolean space
*/
unsigned int m_thresholds;
/**
* Random number generator seed
*/
unsigned int m_seed;
/**
* Sum of weights of all Token matches
*/
float m_score;
/**
* Sum of all weights
*/
float m_count;
/**
* Similarity score
*/
float m_similarity;
/**
* Number of hashes/permutations
* NOTE(review): the only member with an in-class initializer; both constructors set it explicitly anyway.
*/
unsigned int m_hashes{};
/**
* Number of buckets per hash table
*/
unsigned int m_buckets;
/**
* Processing time in seconds
*/
float m_seconds;
/**
* Report of all investigated Token matches
*/
std::list<ReportEntry> m_report;
/**
* Creates a new Similarity object
* @param left Left Document
* @param right Right Document
* @param thresholds Number of thresholds
* @param seed Random number generator seed
* @param hashes Number of hashes/permutations
* @param buckets Number of buckets in hash tables
*/
Similarity(DocumentPtr left, DocumentPtr right, const unsigned int& thresholds, const unsigned int& seed, const unsigned int& hashes, const unsigned int& buckets);
/**
* Creates an empty Similarity object
*/
Similarity();
/**
* Renders the stored parameters and the Token report as a text table
* @param top NOTE(review): presumably the maximum number of report entries to print - confirm against the implementation
* @param cw Column width used for the cells of the table
* @return Formatted report
*/
[[nodiscard]] std::string toString(const unsigned int& top = 20, int cw = 10) const;
};
private: private:
/** /**
* Dictionary of all related Documents * Dictionary of all related Documents
* @see Dictionary * @see Dictionary
...@@ -89,6 +214,16 @@ namespace utue::pe::tools::lsh { ...@@ -89,6 +214,16 @@ namespace utue::pe::tools::lsh {
*/ */
std::vector<Token::HashSet> m_permutations; std::vector<Token::HashSet> m_permutations;
/**
* Number of hashes/permutations
*/
unsigned int m_hashes;
/**
* Number of buckets in each hash table
*/
unsigned int m_buckets;
public: public:
/** /**
...@@ -102,14 +237,14 @@ namespace utue::pe::tools::lsh { ...@@ -102,14 +237,14 @@ namespace utue::pe::tools::lsh {
* Creates a new document * Creates a new document
* @return document Empty document linked to the MinHash Dictionary and Scanner * @return document Empty document linked to the MinHash Dictionary and Scanner
*/ */
DocumentPtr createDocument(); [[nodiscard]] DocumentPtr createDocument();
/** /**
* Creates a new document * Creates a new document
* @param text content of the document * @param text content of the document
* @return Document with initialized content linked to the MinHash Dictionary and Scanner * @return Document with initialized content linked to the MinHash Dictionary and Scanner
*/ */
DocumentPtr createDocument(const std::string& text); [[nodiscard]] DocumentPtr createDocument(const std::string& text);
/** /**
* creates a new token scanner * creates a new token scanner
...@@ -136,6 +271,12 @@ namespace utue::pe::tools::lsh { ...@@ -136,6 +271,12 @@ namespace utue::pe::tools::lsh {
*/ */
void add(const Token& token); void add(const Token& token);
/**
* Calculates the similarity of two Documents
* @param similarity Read/Write container with two documents
*/
void similarity(Similarity& similarity);
/** /**
* Calculates the similarity of two Documents * Calculates the similarity of two Documents
* @param left The first Document * @param left The first Document
...@@ -144,7 +285,7 @@ namespace utue::pe::tools::lsh { ...@@ -144,7 +285,7 @@ namespace utue::pe::tools::lsh {
* @param seed Random number generator seed * @param seed Random number generator seed
* @return Similarity of two Documents * @return Similarity of two Documents
*/ */
float similarity(const DocumentPtr& left, const DocumentPtr& right, const unsigned int& thresholds = 10, const unsigned int& seed = 0); [[nodiscard]] Similarity similarity(const DocumentPtr& left, const DocumentPtr& right, const unsigned int& thresholds = 10, const unsigned int& seed = 0);
/** /**
* Calculates the similarity of two strings * Calculates the similarity of two strings
...@@ -154,7 +295,7 @@ namespace utue::pe::tools::lsh { ...@@ -154,7 +295,7 @@ namespace utue::pe::tools::lsh {
* @param seed Random number generator seed * @param seed Random number generator seed
* @return Similarity of two strings * @return Similarity of two strings
*/ */
float similarity(const std::string& left, const std::string& right, const unsigned int& thresholds = 10, const unsigned int& seed = 0); [[nodiscard]] Similarity similarity(const std::string& left, const std::string& right, const unsigned int& thresholds = 10, const unsigned int& seed = 0);
/** /**
* Add a Scanner to the MinHash * Add a Scanner to the MinHash
...@@ -162,13 +303,31 @@ namespace utue::pe::tools::lsh { ...@@ -162,13 +303,31 @@ namespace utue::pe::tools::lsh {
*/ */
void add(const ScannerPtr& scanner); void add(const ScannerPtr& scanner);
/**
* Returns the dictionary
* @return Dictionary
*/
[[nodiscard]] const DictionaryPtr& dictionary() const;
/**
* Returns the maximal bucket load
* @return Bucket load
*/
[[nodiscard]] unsigned long maxLoad() const;
/**
* Returns the average bucket load
* @return Bucket load
*/
[[nodiscard]] float avgLoad() const;
/** /**
* Creates a MinHash object * Creates a MinHash object
* @param hashes Number of different hash functions resulting in different permutations. * @param hashes Number of different hash functions resulting in different permutations.
* @param buckets Number of buckets in the chained hash set of Tokens * @param buckets Number of buckets in the chained hash set of Tokens
* @return Pointer to the created MinHash object * @return Pointer to the created MinHash object
*/ */
static std::shared_ptr<MinHash> create(const unsigned int& hashes = 1000, const unsigned int& buckets = 500); [[nodiscard]] static std::shared_ptr<MinHash> create(const unsigned int& hashes = 1000, const unsigned int& buckets = 500);
}; };
/** /**
......
...@@ -55,9 +55,9 @@ ...@@ -55,9 +55,9 @@
#ifndef UTUE_PE_TOOLS_LSH_SCANNER_H #ifndef UTUE_PE_TOOLS_LSH_SCANNER_H
#define UTUE_PE_TOOLS_LSH_SCANNER_H #define UTUE_PE_TOOLS_LSH_SCANNER_H
#include <utue/pe/tools/lsh/Token.h>
#include <functional> #include <functional>
#include <memory> #include <memory>
#include <utue/pe/tools/lsh/Token.h>
namespace utue::pe::tools::lsh { namespace utue::pe::tools::lsh {
......
...@@ -147,9 +147,9 @@ namespace utue::pe::tools::lsh { ...@@ -147,9 +147,9 @@ namespace utue::pe::tools::lsh {
float m_weight; float m_weight;
/** /**
* Pointer to the value of the Token * Value of the Token
*/ */
char* m_value; char m_value[32];
/** /**
* Size of the value * Size of the value
...@@ -164,12 +164,6 @@ namespace utue::pe::tools::lsh { ...@@ -164,12 +164,6 @@ namespace utue::pe::tools::lsh {
*/ */
Token(); Token();
/**
* Copy a token
* @param token to copy
*/
Token(const Token& token);
/** /**
* Create a full initialized Token * Create a full initialized Token
* @param type Type of the Token * @param type Type of the Token
...@@ -267,6 +261,8 @@ namespace utue::pe::tools::lsh { ...@@ -267,6 +261,8 @@ namespace utue::pe::tools::lsh {
* @return String showing the Token variables * @return String showing the Token variables
*/ */
[[nodiscard]] std::string toString() const; [[nodiscard]] std::string toString() const;
/**
* Returns a textual representation of a Token type
* @param type Token type to convert
* @return String for the given type
*/
static std::string typeToString(const Type& type);
}; };
} }
......
...@@ -168,9 +168,11 @@ namespace utue::pe::utils { ...@@ -168,9 +168,11 @@ namespace utue::pe::utils {
HashSet(const unsigned int& buckets = 100, const unsigned long& seed = 0) : HashSet(const unsigned int& buckets = 100, const unsigned long& seed = 0) :
m_hash(), m_hash(),
m_equal(), m_equal(),
m_buckets(buckets), m_buckets(),
m_active_buckets(), m_active_buckets(),
m_seed(seed) { } m_seed(seed) {
this->m_buckets.resize(buckets);
}
/** /**
* Inserts an element to the HashSet * Inserts an element to the HashSet
...@@ -192,12 +194,24 @@ namespace utue::pe::utils { ...@@ -192,12 +194,24 @@ namespace utue::pe::utils {
return this->m_buckets[i].emplace_back(element); return this->m_buckets[i].emplace_back(element);
} }
/**
 * Checks whether an equal element is already stored in the HashSet.
 * Only the bucket selected by the element's hash is searched.
 * @param element Element to look up
 * @return true if the bucket holds an element that compares equal, false otherwise
 */
[[nodiscard]] bool has(const T& element) {
    const unsigned long index = this->hash(element);
    auto& bucket = this->m_buckets[index];

    for (auto& candidate : bucket) {
        if (this->m_equal(candidate, element)) {
            return true;
        }
    }

    return false;
}
/** /**
* Returns a iterator container for fast bucket iteration. * Returns a iterator container for fast bucket iteration.
* Only active buckets will be considered. * Only active buckets will be considered.
* @return Iterator container * @return Iterator container
*/ */
Iterator<FastBucketIterator> fastIterator() { [[nodiscard]] Iterator<FastBucketIterator> fastIterator() {
return Iterator<FastBucketIterator>(this->m_active_buckets.begin(),this->m_active_buckets.end()); return Iterator<FastBucketIterator>(this->m_active_buckets.begin(),this->m_active_buckets.end());
} }
...@@ -206,7 +220,7 @@ namespace utue::pe::utils { ...@@ -206,7 +220,7 @@ namespace utue::pe::utils {
* All buckets will be considered, if active or not. * All buckets will be considered, if active or not.
* @return Iterator container * @return Iterator container
*/ */
Iterator<SlowBucketIterator> slowIterator() { [[nodiscard]] Iterator<SlowBucketIterator> slowIterator() {
return Iterator<SlowBucketIterator>(this->m_buckets.begin(),this->m_buckets.end()); return Iterator<SlowBucketIterator>(this->m_buckets.begin(),this->m_buckets.end());
} }
...@@ -215,7 +229,7 @@ namespace utue::pe::utils { ...@@ -215,7 +229,7 @@ namespace utue::pe::utils {
* Only active buckets will be considered. * Only active buckets will be considered.
* @return Iterator container * @return Iterator container
*/ */
Iterator<FastConstBucketIterator> fastIterator() const { [[nodiscard]] Iterator<FastConstBucketIterator> fastIterator() const {
return Iterator<FastConstBucketIterator>(this->m_active_buckets.begin(),this->m_active_buckets.end()); return Iterator<FastConstBucketIterator>(this->m_active_buckets.begin(),this->m_active_buckets.end());
} }
...@@ -224,10 +238,41 @@ namespace utue::pe::utils { ...@@ -224,10 +238,41 @@ namespace utue::pe::utils {
* All buckets will be considered, if active or not. * All buckets will be considered, if active or not.
* @return Iterator container * @return Iterator container
*/ */
Iterator<SlowConstBucketIterator> slowIterator() const { [[nodiscard]] Iterator<SlowConstBucketIterator> slowIterator() const {
return Iterator<SlowConstBucketIterator>(this->m_buckets.begin(),this->m_buckets.end()); return Iterator<SlowConstBucketIterator>(this->m_buckets.begin(),this->m_buckets.end());
} }
/**
 * Determines the size of the fullest bucket.
 * Only active buckets are inspected.
 * @return Largest bucket size, 0 if there is no active bucket
 */
[[nodiscard]] unsigned long maxLoad() const {
    unsigned long largest = 0;

    for(const auto& active : this->m_active_buckets) {
        const unsigned long current = active->size();
        if(largest < current) {
            largest = current;
        }
    }

    return largest;
}
/**
 * Returns the average bucket load.
 * Only active buckets are taken into account.
 * @return Mean number of elements per active bucket, 0 if there is no active bucket
 */
[[nodiscard]] float avgLoad() const {
    // Without this guard an empty set would compute 0.0f / 0 and return NaN.
    if(this->m_active_buckets.empty())
        return 0.0f;

    float load = 0.0f;

    for(const auto& bucket : this->m_active_buckets)
        load += static_cast<float>(bucket->size());

    return load / static_cast<float>(this->m_active_buckets.size());
}
protected: protected:
/** /**
......
...@@ -124,7 +124,7 @@ namespace utue::pe::utils { ...@@ -124,7 +124,7 @@ namespace utue::pe::utils {
* Creates a time measurement relative to a given Tic * Creates a time measurement relative to a given Tic
* @param tic Start timestamp * @param tic Start timestamp
*/ */
Toc(const Tic& tic); explicit Toc(const Tic& tic);
/** /**
* Creates a time measurement relative to the last Tic. * Creates a time measurement relative to the last Tic.
......
...@@ -84,6 +84,6 @@ namespace utue::pe::tools::lsh { ...@@ -84,6 +84,6 @@ namespace utue::pe::tools::lsh {
} }
float Dictionary::frequency(const Token& token) const { float Dictionary::frequency(const Token& token) const {
return float(this->m_tokens(token,0))/float(this->m_max); return float(this->m_tokens(token,0))/float(this->m_count);
} }
} }
\ No newline at end of file
...@@ -58,10 +58,145 @@ ...@@ -58,10 +58,145 @@
#include <memory> #include <memory>
#include <cmath> #include <cmath>
#include <random> #include <utility>
#include <utue/pe/utils/TicToc.h>
#include <sstream>
#include <iomanip>
namespace utue::pe::tools::lsh { namespace utue::pe::tools::lsh {
MinHash::Similarity::ReportEntry::ReportEntry() : m_token(), m_leftFreq(0.0f), m_rightFreq(0.0f), m_dictFreq(0.0f), m_weight(0.0f), m_matches(0.0f) {};
MinHash::Similarity::Similarity(DocumentPtr left, DocumentPtr right, const unsigned int& thresholds, const unsigned int& seed, const unsigned int& hashes, const unsigned int& buckets) :
m_left(std::move(left)),
m_right(std::move(right)),
m_thresholds(thresholds),
m_seed(seed),
m_score(0.0f),
m_count(0.0f),
m_similarity(0.0f),
m_hashes(hashes),
m_buckets(buckets),
m_seconds(0.0f),
m_report() { }
/**
 * Builds an empty Similarity object: no Documents attached, every
 * parameter and result field zero, empty report.
 */
MinHash::Similarity::Similarity() :
        Similarity(DocumentPtr(), DocumentPtr(), 0u, 0u, 0u, 0u) { }
std::string MinHash::Similarity::toString(const unsigned int& top, int cw) const {
std::stringstream ss;
std::list<ReportEntry> report;
unsigned int count;
//report.assign(this->m_report.begin(),this->m_report.end());
for(const auto& entry : this->m_report)
report.emplace_back(ReportEntry(entry));
ss << " --------------- R E P O R T ---------------" << std::endl;
ss << " Number of hashes: " << this->m_hashes << std::endl;
ss << " Number of buckets: " << this->m_buckets << std::endl;
ss << " Number of thresholds: " << this->m_thresholds << std::endl;
ss << " Seed: " << this->m_seed << std::endl;
ss << " Runtime: " << this->m_seconds << " seconds" << std::endl;
ss << " Similarity: " << this->m_similarity << std::endl;
ss << " +"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+" << std::endl;
ss << " | " << std::setw(cw*4+9) << std::setfill(' ') << "Token"
<< " | " << std::setw(cw) << std::setfill(' ') << "Left"
<< " | " << std::setw(cw) << std::setfill(' ') << "Right"
<< " | " << std::setw(cw) << std::setfill(' ') << "Dictionary"
<< " | " << std::setw(cw*3+6) << std::setfill(' ') << "MinHash"
<< " | " << std::endl;
ss << " +"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+" << std::endl;
ss << " | " << std::setw(cw) << std::setfill(' ') << "Type"
<< " | " << std::setw(cw) << std::setfill(' ') << "Length"
<< " | " << std::setw(cw) << std::setfill(' ') << "Value"
<< " | " << std::setw(cw) << std::setfill(' ') << "Weight"
<< " | " << std::setw(cw) << std::setfill(' ') << "Frequency"
<< " | " << std::setw(cw) << std::setfill(' ') << "Frequency"
<< " | " << std::setw(cw) << std::setfill(' ') << "Frequency"
<< " | " << std::setw(cw) << std::setfill(' ') << "Matches"
<< " | " << std::setw(cw) << std::setfill(' ') << "Weight"
<< " | " << std::setw(cw) << std::setfill(' ') << "Score"
<< " | " << std::endl;
ss << " +"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+"
<< std::setw(cw+3) << std::setfill('-') << "+" << std::endl;
report.sort(
[](const ReportEntry& left, const ReportEntry& right) -> bool {
return (left.m_matches*left.m_weight) > (right.m_matches*right.m_weight);
});
count = 0;