Commit 5e41bd62 authored by David Geisler's avatar David Geisler

memory optimizations

parent e703635d
......@@ -63,6 +63,13 @@
namespace utue::pe::tools::lsh {
class Dictionary;
/**
* Pointer to a Dictionary object
*/
typedef std::shared_ptr<Dictionary> DictionaryPtr;
/**
* @brief The dictionary contains all tokens and their frequencies found in the corresponding documents.
*/
......@@ -89,18 +96,20 @@ namespace utue::pe::tools::lsh {
*/
unsigned int m_max;
public:
protected:
/**
* Creates an empty dictionary
*/
Dictionary(const unsigned int& buckets);
explicit Dictionary(const unsigned int& buckets);
public:
/**
*
* @param token
*/
virtual void add(const Token& token);
virtual void add(const TokenPtr& token);
/**
* Returns the tokens contained in the Dictionary and their frequency.
......@@ -125,20 +134,22 @@ namespace utue::pe::tools::lsh {
* @param token Token of the requested frequency
* @return relative frequency
*/
[[nodiscard]] float frequency(const Token& token) const;
[[nodiscard]] float frequency(const TokenPtr& token) const;
/**
* Returns the number of occurrences of a token in the documents
* @param token Requested token
* @return number of occurrences
*/
[[nodiscard]] unsigned int count(const Token& token) const;
};
[[nodiscard]] unsigned int count(const TokenPtr& token) const;
/**
* Pointer to a Dictionary object
*/
typedef std::shared_ptr<Dictionary> DictionaryPtr;
/**
* Create a new Dictionary instance
* @param buckets Number of buckets reserved for Tokens
* @return A shared pointer to the created Dictionary instance
*/
[[nodiscard]] static DictionaryPtr create(const unsigned int& buckets);
};
}
#endif //UTUE_PE_TOOLS_LSH_DICTIONARY_H
......@@ -62,6 +62,13 @@
namespace utue::pe::tools::lsh {
class Document;
/**
* Pointer to a Document object
*/
typedef std::shared_ptr<Document> DocumentPtr;
/**
* @brief A document extracts Tokens from a given text and stores them in its own and a shared dictionary.
*/
......@@ -79,7 +86,7 @@ namespace utue::pe::tools::lsh {
*/
ScannerPtr m_scanner;
public:
protected:
/**
* Create a new Document
......@@ -88,23 +95,27 @@ namespace utue::pe::tools::lsh {
*/
Document(DictionaryPtr dictionary, ScannerPtr scanner, const unsigned int& buckets);
public:
/**
* Add a Token to the Document
* @param token Token to add
*/
void add(const Token& token) override;
void add(const TokenPtr& token) override;
/**
* Parse text and add extracted Tokens to the Document
* @param text Text to parse
*/
void scan(const std::string& text);
};
/**
* Pointer to a Document object
*/
typedef std::shared_ptr<Document> DocumentPtr;
/**
* Create a new Document
* @param dictionary The related dictionary
* @param scanner Scanner to extract Tokens from text
*/
[[nodiscard]] static DocumentPtr create(const DictionaryPtr& dictionary, const ScannerPtr& scanner, const unsigned int& buckets);
};
}
#endif //UTUE_PE_TOOLS_LSH_DOCUMENT_H
......@@ -66,11 +66,23 @@
namespace utue::pe::tools::lsh {
class MinHash;
/**
* Pointer to a MinHash object
*/
typedef std::shared_ptr<MinHash> MinHashPtr;
/**
* @brief Implements a MinHash algorithm to estimate the similarity of multiple Documents
*/
class MinHash {
public:
public:
class Similarity;
typedef std::shared_ptr<Similarity> SimilarityPtr;
/**
* @brief Stores the result of one MinHash similarity calculation
......@@ -78,6 +90,10 @@ namespace utue::pe::tools::lsh {
class Similarity {
public:
class ReportEntry;
typedef std::shared_ptr<ReportEntry> ReportEntryPtr;
/**
* @brief Represents a report entry
*/
......@@ -87,7 +103,7 @@ namespace utue::pe::tools::lsh {
/**
* Investigated token
*/
Token m_token;
TokenPtr m_token;
/**
* Frequency of the Token in the left Document
......@@ -114,10 +130,18 @@ namespace utue::pe::tools::lsh {
*/
float m_matches;
protected:
/**
* Constructs an empty ReportEntry
*/
ReportEntry();
public:
/**
* Constructs an empty ReportEntry
* @return Returns a shared pointer to the created ReportEntry instance
*/
[[nodiscard]] static ReportEntryPtr create();
};
/**
......@@ -173,8 +197,9 @@ namespace utue::pe::tools::lsh {
/**
* Report of all investigated Token matches
*/
std::list<ReportEntry> m_report;
std::list<ReportEntryPtr> m_report;
protected:
/**
* Creates a new Similarity object
* @param left Left Document
......@@ -191,7 +216,27 @@ namespace utue::pe::tools::lsh {
*/
Similarity();
public:
[[nodiscard]] std::string toString(const unsigned int& top = 5, int cw = 10) const;
/**
* Constructs an empty Similarity object
* @return Returns a shared pointer to the created Similarity instance
*/
[[nodiscard]] static SimilarityPtr create();
/**
* Creates a new Similarity object
* @param left Left Document
* @param right Right Document
* @param thresholds Number of thresholds
* @param seed Random number generator seed
* @param hashes Number of hashes/permutations
* @param buckets Number of buckets in hash tables
* @return Returns a shared pointer to the created Similarity instance
*/
[[nodiscard]] static SimilarityPtr create(const DocumentPtr& left, const DocumentPtr& right, const unsigned int& thresholds, const unsigned int& seed, const unsigned int& hashes, const unsigned int& buckets);
};
private:
......@@ -224,7 +269,7 @@ namespace utue::pe::tools::lsh {
*/
unsigned int m_buckets;
public:
protected:
/**
* Creates an empty MinHash object
......@@ -233,6 +278,8 @@ namespace utue::pe::tools::lsh {
*/
explicit MinHash(const unsigned int& hashes = 100, const unsigned int& buckets = 500);
public:
/**
* Creates a new document
* @return document Empty document linked to the MinHash Dictionary and Scanner
......@@ -269,13 +316,13 @@ namespace utue::pe::tools::lsh {
* @todo: hide this function from public
* @param token Token to add
*/
void add(const Token& token);
void add(const TokenPtr& token);
/**
* Calculates the similarity of two Documents
* @param similarity Read/Write container with two documents
*/
void similarity(Similarity& similarity) const;
void similarity(SimilarityPtr& similarity) const;
/**
* Calculates the similarity of two Documents
......@@ -285,7 +332,7 @@ namespace utue::pe::tools::lsh {
* @param seed Random number generator seed
* @return Similarity of two Documents
*/
[[nodiscard]] Similarity similarity(const DocumentPtr& left, const DocumentPtr& right, const unsigned int& thresholds = 10, const unsigned int& seed = 0) const;
[[nodiscard]] SimilarityPtr similarity(const DocumentPtr& left, const DocumentPtr& right, const unsigned int& thresholds = 10, const unsigned int& seed = 0) const;
/**
* Calculates the similarity of two strings
......@@ -295,7 +342,7 @@ namespace utue::pe::tools::lsh {
* @param seed Random number generator seed
* @return Similarity of two strings
*/
[[nodiscard]] Similarity similarity(const std::string& left, const std::string& right, const unsigned int& thresholds = 10, const unsigned int& seed = 0);
[[nodiscard]] SimilarityPtr similarity(const std::string& left, const std::string& right, const unsigned int& thresholds = 10, const unsigned int& seed = 0);
/**
* Add a Scanner to the MinHash
......@@ -327,13 +374,8 @@ namespace utue::pe::tools::lsh {
* @param buckets Number of buckets in the chained hash set of Tokens
* @return Pointer to the created MinHash object
*/
[[nodiscard]] static std::shared_ptr<MinHash> create(const unsigned int& hashes = 1000, const unsigned int& buckets = 500);
[[nodiscard]] static MinHashPtr create(const unsigned int& hashes = 1000, const unsigned int& buckets = 500);
};
/**
* Pointer to a MinHash object
*/
typedef std::shared_ptr<MinHash> MinHashPtr;
}
#endif //UTUE_PE_TOOLS_LSH_MINHASH_H
......@@ -95,7 +95,7 @@ namespace utue::pe::tools::lsh {
* @param text Text to scan
* @param callback Callback function gets called for each extracted Token
*/
virtual void scan(const std::string& text, std::function<void (const Token&)> callback) = 0;
virtual void scan(const std::string& text, std::function<void (const TokenPtr&)> callback) = 0;
/**
* Creates a Scanner for a specific Token type.
......
......@@ -61,12 +61,23 @@
namespace utue::pe::tools::lsh {
class SubsMatch;
/**
* Pointer to a SubsMatch object
*/
typedef std::shared_ptr<SubsMatch> SubsMatchPtr;
/**
* @brief Implements a SubsMatch algorithm to estimate the similarity of multiple Documents
*/
class SubsMatch {
public:
class Similarity;
typedef std::shared_ptr<Similarity> SimilarityPtr;
/**
* @brief Stores the result of one SubsMatch similarity calculation
*/
......@@ -93,6 +104,7 @@ namespace utue::pe::tools::lsh {
*/
float m_seconds;
protected:
/**
* Creates a new Similarity object
* @param left Left Document
......@@ -105,7 +117,22 @@ namespace utue::pe::tools::lsh {
*/
Similarity();
public:
[[nodiscard]] std::string toString() const;
/**
* Constructs an empty Similarity object
* @return Returns a shared pointer to the created Similarity instance
*/
[[nodiscard]] static SimilarityPtr create();
/**
* Creates a new Similarity object
* @param left Left Document
* @param right Right Document
* @return Returns a shared pointer to the created Similarity instance
*/
[[nodiscard]] static SimilarityPtr create(const DocumentPtr& left, const DocumentPtr& right);
};
private:
......@@ -122,7 +149,7 @@ namespace utue::pe::tools::lsh {
*/
ScannerPtr m_scanner;
public:
protected:
/**
* Creates an empty SubsMatch object
......@@ -130,6 +157,7 @@ namespace utue::pe::tools::lsh {
*/
explicit SubsMatch();
public:
/**
* Creates a new document
* @return document Empty document linked to the MinHash Dictionary and Scanner
......@@ -164,7 +192,7 @@ namespace utue::pe::tools::lsh {
* Calculates the similarity of two Documents
* @param similarity Read/Write container with two documents
*/
void similarity(Similarity& similarity) const;
void similarity(SimilarityPtr& similarity) const;
/**
* Calculates the similarity of two Documents
......@@ -172,7 +200,7 @@ namespace utue::pe::tools::lsh {
* @param right The second Document
* @return Similarity of two Documents
*/
[[nodiscard]] Similarity similarity(const DocumentPtr& left, const DocumentPtr& right) const;
[[nodiscard]] SimilarityPtr similarity(const DocumentPtr& left, const DocumentPtr& right) const;
/**
* Calculates the similarity of two strings
......@@ -180,7 +208,7 @@ namespace utue::pe::tools::lsh {
* @param right The second string
* @return Similarity of two strings
*/
[[nodiscard]] Similarity similarity(const std::string& left, const std::string& right);
[[nodiscard]] SimilarityPtr similarity(const std::string& left, const std::string& right);
/**
* Add a Scanner to the MinHash
......@@ -198,13 +226,8 @@ namespace utue::pe::tools::lsh {
* Creates a SubsMatch object
* @return Pointer to the created SubsMatch object
*/
[[nodiscard]] static std::shared_ptr<SubsMatch> create();
[[nodiscard]] static SubsMatchPtr create();
};
/**
* Pointer to a SubsMatch object
*/
typedef std::shared_ptr<SubsMatch> SubsMatchPtr;
}
#endif //UTUE_PE_TOOLS_LSH_SUBSMATCH_H
......@@ -61,6 +61,10 @@
namespace utue::pe::tools::lsh {
class Token;
typedef std::shared_ptr<Token> TokenPtr;
/**
* A Token represents any kind of semantic meaning in a string
*/
......@@ -108,26 +112,26 @@ namespace utue::pe::tools::lsh {
* @param key Token to calculate hash
* @return Hash of the token
*/
unsigned long operator()(const Token &key, const unsigned long& seed) const;
unsigned long operator()(const TokenPtr &key, const unsigned long& seed) const;
/**
* Compares two Tokens without using the hash
* @param left First Token
* @param right Second Token
* @return True if both objects are the same
*/
bool operator()(const Token& left, const Token& right) const;
bool operator()(const TokenPtr& left, const TokenPtr& right) const;
};
/**
* HashMap with a Token key
*/
template <class ValueType>
using HashMap = utils::HashMap<Token, ValueType, Token::Hash>;
using HashMap = utils::HashMap<TokenPtr, ValueType, Token::Hash>;
/**
* Hash Set with a Token key
*/
typedef utils::HashSet<Token,Token::Hash> HashSet;
typedef utils::HashSet<TokenPtr,Token::Hash> HashSet;
private:
......@@ -151,13 +155,13 @@ namespace utue::pe::tools::lsh {
*/
unsigned int m_length;
public:
protected:
/**
* Creates an empty Token
* The type is initialized to TOKEN, the weight to 1.0f, and the value to null
*/
Token();
//Token();
/**
* Create a full initialized Token
......@@ -168,6 +172,7 @@ namespace utue::pe::tools::lsh {
*/
Token(const Type& type, const char* value, const unsigned int& length, const float& weight = 1.0f);
public:
/**
* Destroy the Token
*/
......@@ -251,13 +256,35 @@ namespace utue::pe::tools::lsh {
*/
[[nodiscard]] bool equal(const Token& token) const;
/**
* Compares the Token to another one
* @param token Token to compare with
* @return True if both Tokens are equal
*/
[[nodiscard]] bool equal(const TokenPtr& token) const;
/**
* Prints the Token variables as string
* @return String showing the Token variables
*/
[[nodiscard]] std::string toString() const;
/**
* Translates the Token type to a readable string
* @param type Type of the Token
* @return String
*/
static std::string typeToString(const Type& type);
/**
* Create a Token instance
* @param type Type of the Token
* @param value Value of the Token in bytes
* @param length Length of the Token value
* @param weight Weight of the Token
* @return Shared pointer to the Token instance
*/
static TokenPtr create(const Type& type, const char* value, const unsigned int& length, const float& weight = 1.0f);
};
}
......
......@@ -61,7 +61,7 @@ namespace utue::pe::tools::lsh {
}
void Dictionary::add(const Token& token) {
void Dictionary::add(const TokenPtr& token) {
unsigned int count;
count = this->m_tokens(token,0)+=1;
......@@ -83,11 +83,15 @@ namespace utue::pe::tools::lsh {
return this->m_max;
}
float Dictionary::frequency(const Token& token) const {
float Dictionary::frequency(const TokenPtr& token) const {
return float(this->m_tokens(token,0))/float(this->m_count);
}
unsigned int Dictionary::count(const Token& token) const {
unsigned int Dictionary::count(const TokenPtr& token) const {
return this->m_tokens(token,0);
}
DictionaryPtr Dictionary::create(const unsigned int& buckets) {
return DictionaryPtr(new Dictionary(buckets));
}
}
\ No newline at end of file
......@@ -66,14 +66,18 @@ namespace utue::pe::tools::lsh {
}
void Document::add(const Token& token) {
void Document::add(const TokenPtr& token) {
Dictionary::add(token);
this->m_dictionary->add(token);
}
void Document::scan(const std::string& text) {
this->m_scanner->scan(text,[&](const Token& token){
this->m_scanner->scan(text,[&](const TokenPtr& token){
this->add(token);
});
}
DocumentPtr Document::create(const DictionaryPtr& dictionary, const ScannerPtr& scanner, const unsigned int& buckets) {
return DocumentPtr(new Document(dictionary,scanner,buckets));
}
}
\ No newline at end of file
......@@ -65,7 +65,11 @@
namespace utue::pe::tools::lsh {
MinHash::Similarity::ReportEntry::ReportEntry() : m_token(), m_leftFreq(0.0f), m_rightFreq(0.0f), m_dictFreq(0.0f), m_weight(0.0f), m_matches(0.0f) {};
MinHash::Similarity::ReportEntry::ReportEntry() : m_token(), m_leftFreq(0.0f), m_rightFreq(0.0f), m_dictFreq(0.0f), m_weight(0.0f), m_matches(0.0f) {}
MinHash::Similarity::ReportEntryPtr MinHash::Similarity::ReportEntry::create() {
return ReportEntryPtr(new ReportEntry());
};
MinHash::Similarity::Similarity(DocumentPtr left, DocumentPtr right, const unsigned int& thresholds, const unsigned int& seed, const unsigned int& hashes, const unsigned int& buckets) :
m_left(std::move(left)),
......@@ -95,13 +99,10 @@ namespace utue::pe::tools::lsh {
std::string MinHash::Similarity::toString(const unsigned int& top, int cw) const {
std::stringstream ss;
std::list<ReportEntry> report;
std::list<ReportEntryPtr> report;
unsigned int count;
//report.assign(this->m_report.begin(),this->m_report.end());
for(const auto& entry : this->m_report)
report.emplace_back(ReportEntry(entry));
report.assign(this->m_report.begin(),this->m_report.end());
ss << " --------------- R E P O R T ---------------" << std::endl;
ss << " Number of hashes: " << this->m_hashes << std::endl;
......@@ -162,22 +163,22 @@ namespace utue::pe::tools::lsh {
<< std::setw(cw+3) << std::setfill('-') << "+" << std::endl;
report.sort(
[](const ReportEntry& left, const ReportEntry& right) -> bool {
return (left.m_matches*left.m_weight) > (right.m_matches*right.m_weight);
[](const ReportEntryPtr& left, const ReportEntryPtr& right) -> bool {
return (left->m_matches*left->m_weight) > (right->m_matches*right->m_weight);
});
count = 0;
for(const auto& entry : report) {
count++;
ss << " | " << std::setw(cw+0) << std::setfill(' ') << Token::typeToString(entry.m_token.type())
<< " | " << std::setw(cw+0) << std::setfill(' ') << entry.m_token.length()
<< " | " << std::setw(cw+0) << std::setfill(' ') << ("\"" + std::string(entry.m_token.value(),entry.m_token.length()) + "\"")
<< " | " << std::setw(cw+0) << std::setfill(' ') << std::fixed << std::setprecision( 2 )<< entry.m_token.weight()
<< " | " << std::setw(cw-1) << std::setfill(' ') << std::fixed << std::setprecision( 2 )<< 100.0f*entry.m_leftFreq << "%"
<< " | " << std::setw(cw-1) << std::setfill(' ') << std::fixed << std::setprecision( 2 )<< 100.0f*entry.m_rightFreq << "%"
<< " | " << std::setw(cw-1) << std::setfill(' ') << std::fixed << std::setprecision( 2 )<< 100.0f*entry.m_dictFreq << "%"
<< " | " << std::setw(cw-1) << std::setfill(' ') << std::fixed << std::setprecision( 2 )<< 100.0f*entry.m_matches << "%"
<< " | " << std::setw(cw-1) << std::setfill(' ') << std::fixed << std::setprecision( 2 )<< 100.0f*entry.m_weight/this->m_count << "%"
<< " | " << std::setw(cw-1) << std::setfill(' ') << std::fixed << std::setprecision( 2 )<< 100.0f*entry.m_matches*entry.m_weight/this->m_count << "%"
ss << " | " << std::setw(cw+0) << std::setfill(' ') << Token::typeToString(entry->m_token->type())
<< " | " << std::setw(cw+0) << std::setfill(' ') << entry->m_token->length()
<< " | " << std::setw(cw+0) << std::setfill(' ') << ("\"" + std::string(entry->m_token->value(),entry->m_token->length()) + "\"")
<< " | " << std::setw(cw+0) << std::setfill(' ') << std::fixed << std::setprecision( 2 )<< entry->m_token->weight()
<< " | " << std::setw(cw-1) << std::setfill(' ') << std::fixed << std::setprecision( 2 )<< 100.0f*entry->m_leftFreq << "%"
<< " | " << std::setw(cw-1) << std::setfill(' ') << std::fixed << std::setprecision( 2 )<< 100.0f*entry->m_rightFreq << "%"
<< " | " <<