Commit a081d3cd authored by David Geisler's avatar David Geisler

First working version

parent da34f779
# pe-tools-lsh
Scanpath similarity using locality sensitive hashing
\ No newline at end of file
Scanpath similarity using locality sensitive hashing.
The actual algorithm can be found at:
```source/utue/pe/tools/lsh/MinHash.cpp```.
An example program can be found at: ```source/utue/pe/tools/lsh/example/MinHash.cpp```.
## Build
There is a cmake script to build the example program:
```bash
$ mkdir build
$ cd build
$ cmake ..
$ make all
```
Note that the project uses git submodules. If you haven't checked them out yet, do it like this:
```bash
$ git submodule update --init
```
......@@ -49,22 +49,29 @@
*
* @package utue.pe.tools.lsh
*
* @todo add brief description
* @brief Defines a Dictionary type. A dictionary stores tokens in a HashMap and counts their frequency. For example,
* several documents can create a Dictionary and use it to create a common feature space. At the same time, documents
* can also determine their own feature space using a non shared dictionary. This enables to determine Token subsets
* over several documents.
*/
#ifndef UTUE_PE_TOOLS_LSH_DOCUMENT_H
#define UTUE_PE_TOOLS_LSH_DOCUMENT_H
#ifndef UTUE_PE_TOOLS_LSH_DICTIONARY_H
#define UTUE_PE_TOOLS_LSH_DICTIONARY_H
#include <sstream>
#include <vector>
#include <algorithm>
#include <random>
#include <utue/pe/tools/lsh/Token.h>
#include <memory>
namespace utue::pe::tools::lsh {
class Document {
/**
* @brief The dictionary contains all tokens and their frequencies found in the corresponding documents.
*/
class Dictionary {
private:
/**
* Hashmap which stores the frequency for each found token
*/
typedef Token::HashMap<unsigned int> TokenFreq;
/**
......@@ -82,15 +89,49 @@ namespace utue::pe::tools::lsh {
*/
unsigned int m_max;
public:
/**
* Creates an empty dictionary
*/
Dictionary();
/**
*
* @param token
*/
virtual void add(const Token& token);
public:
Document();
/**
* Returns the tokens contained in the Dictionary and their frequency.
* @return Hashmap of all tokens and frequency
*/
[[nodiscard]] const TokenFreq& tokens() const;
/**
* Returns the number of all tokens found.
* @return Number of found tokens
*/
[[nodiscard]] const unsigned int& count() const;
void add(const Token& token);
/**
* Returns the number of occurrences of the most frequent token.
* @return Number of occurrences of the most frequent token
*/
[[nodiscard]] const unsigned int& max() const;
void add(const std::string& text);
/**
* Calculates the relative frequency of a token, measured by the most frequently occurring token.
* @param token Token of the requested frequency
* @return relative frequency
*/
[[nodiscard]] float frequency(const Token& token) const;
};
/**
* Pointer to a Dictionary object
*/
typedef std::shared_ptr<Dictionary> DictionaryPtr;
}
#endif //UTUE_PE_TOOLS_LSH_DOCUMENT_H
#endif //UTUE_PE_TOOLS_LSH_DICTIONARY_H
......@@ -49,7 +49,9 @@
*
* @package utue.pe.tools.lsh
*
* @todo add brief description
* @brief Defines a Document type. A Document uses a Scanner to extract tokens from its text and saves them in
* different dictionaries.
* @see Dictionary
*/
#ifndef UTUE_PE_TOOLS_LSH_DOCUMENT_H
......@@ -60,40 +62,48 @@
namespace utue::pe::tools::lsh {
/**
 * @brief A document extracts Tokens from a given text and stores them in a own and a shared dictionary.
 */
class Document : public Dictionary {
private:
/**
 * The shared dictionary, common to all Documents that were created with it
 * @see Dictionary
 */
DictionaryPtr m_dictionary;
/**
 * Scanner to extract Tokens from text
 * @see Scanner
 */
ScannerPtr m_scanner;
public:
/**
 * Create a new Document
 * @param dictionary The related (shared) dictionary
 * @param scanner Scanner to extract Tokens from text
 */
Document(DictionaryPtr dictionary, ScannerPtr scanner);
/**
 * Add a Token to the Document (and, presumably, to the shared dictionary — confirm in the .cpp)
 * @param token Token to add
 */
void add(const Token& token) override;
/**
 * Parse text and add extracted Tokens to the Document
 * @param text Text to parse
 */
void scan(const std::string& text);
};
/**
* Pointer to a Document object
*/
typedef std::shared_ptr<Document> DocumentPtr;
}
......
......@@ -60,6 +60,7 @@
#define UTUE_PE_TOOLS_LSH_MINHASH_H
#include <utue/pe/tools/lsh/Document.h>
#include <utue/pe/utils/HashSet.h>
#include <memory>
#include <list>
......@@ -70,26 +71,6 @@ namespace utue::pe::tools::lsh {
*/
class MinHash {
private:
/**
* @brief Very basic implementation of a chained hash set
*/
class TokenHashSet {
public:
typedef std::vector<std::list<Token>>::const_iterator BucketIterator;
private:
std::vector<std::list<Token>> m_buckets;
unsigned long m_size;
unsigned long m_seed;
public:
TokenHashSet(const unsigned int& buckets, const unsigned long& seed);
void insert(const Token& token);
BucketIterator begin() const;
BucketIterator end() const;
};
/**
* Dictionary of all related Documents
* @see Dictionary
......@@ -106,7 +87,7 @@ namespace utue::pe::tools::lsh {
* List of hash sets of Tokens. Each hash set was calculated by a different hash function. This results in
* different permutations of the tokens.
*/
std::vector<TokenHashSet> m_permutations;
std::vector<Token::HashSet> m_permutations;
public:
......
......@@ -49,7 +49,7 @@
*
* @package utue.pe.tools.lsh
*
* @todo add brief description
* @brief Defines an NGRAM Token Scanner
*/
#ifndef UTUE_PE_TOOLS_LSH_NGRAM_H
......@@ -59,17 +59,12 @@
namespace utue::pe::tools::lsh {
/**
 * @brief Token specialization for n-grams, produced by the NGRAM Scanner
 * @see Scanner::create<Token::NGRAM>
 */
class NGram : public Token {
public:
/**
 * Creates an empty NGram token
 */
NGram();
};
/**
*
* Creates a NGRAM Token Scanner
* @param weight weight of the extracted ngram
* @param length length of the ngram
* @param leaks maximum length of allowed gaps
* @return Pointer to the created Scanner object
*/
template <>
ScannerPtr Scanner::create<Token::NGRAM>(const float& weight, int length, int leaks);
......
......@@ -49,116 +49,70 @@
*
* @package utue.pe.tools.lsh
*
* @todo add brief description
* @brief Defines an abstract Scanner to extract tokens of a certain type from text.
*/
#ifndef UTUE_PE_TOOLS_LSH_TOKEN_H
#define UTUE_PE_TOOLS_LSH_TOKEN_H
#ifndef UTUE_PE_TOOLS_LSH_SCANNER_H
#define UTUE_PE_TOOLS_LSH_SCANNER_H
#include <functional>
#include <unordered_map>
#include <unordered_set>
#include <boost/shared_ptr.hpp>
#include <memory>
#include <utue/pe/tools/lsh/Token.h>
namespace utue::pe::tools::lsh {
/**
* @todo add brief description
* @brief Extracts Tokens from a given string
*/
class Token {
public:
class Scanner {
private:
/**
* @todo add brief description
* Weight of the extracted Tokens
*/
enum Type : unsigned char {
TOKEN = 0x00,
WORD = 0x10,
NGRAM = 0x11,
INTEGER = 0x20,
FLOAT = 0x30
};
float m_weight;
public:
/**
* @todo add brief description
* Creates a new Scanner object
* @param weight Weight of the extracted Tokens
*/
class Hash {
public:
unsigned long operator()(const Token &key) const;
};
explicit Scanner(const float& weight = 1.0f);
/**
* @todo add brief description
* Returns the weight the scanner assigns to the tokens.
* @return Weight of the Tokens
*/
class Equal {
public:
bool operator()(const Token& left, const Token& right) const;
};
[[nodiscard]] const float& weight() const;
/**
* @todo add brief description
* Set the weight which is set to the Tokens extracted by the Scanner
* @param weight Weight of the Tokens
*/
class Scanner {
private:
float m_weight;
public:
explicit Scanner(const float& weight = 1.0f);
[[nodiscard]] const float& weight() const;
void weight(const float& weight);
virtual void scan(const std::string& text, std::function<void (const Token&)> callback) = 0;
template<Type TokenType, typename... args>
static std::shared_ptr<Scanner> create(const float& weight, args...);
};
typedef std::shared_ptr<Scanner> ScannerPtr;
template <class ValueType>
using HashMap = std::unordered_map<Token, ValueType, Token::Hash, Token::Equal>;
using HashSet = std::unordered_set<Token, Token::Hash, Token::Equal>;
private:
Type m_type;
float m_weight;
char* m_value;
unsigned int m_length;
public:
Token();
Token(const Token& token);
Token(const Type& type, const void* value, const unsigned int& length, const float& weight = 1.0f);
virtual ~Token();
[[nodiscard]] Type& type();
[[nodiscard]] const Type& type() const;
void type(const Type& type);
[[nodiscard]] float& weight();
[[nodiscard]] const float& weight() const;
void weight(const float& weight);
[[nodiscard]] void* value();
[[nodiscard]] const void* value() const;
[[nodiscard]] unsigned int& length();
[[nodiscard]] const unsigned int& length() const;
void value(const void* value, const unsigned int& length);
[[nodiscard]] unsigned long hash() const;
[[nodiscard]] unsigned long hash(const unsigned long& seed) const;
[[nodiscard]] bool equal(const Token& token) const;
/**
* Extract the Tokens of a text
* @param text Text to scan
* @param callback Callback function gets called for each extracted Token
*/
virtual void scan(const std::string& text, std::function<void (const Token&)> callback) = 0;
[[nodiscard]] std::string toString() const;
/**
* Creates a Scanner for a specific Token type.
* @tparam TokenType Type of the Token to be extracted by the created Scanner.
* @tparam args Argument types.
* @param weight Weight of the Tokens extracted by the Scanner.
* @param ... Additional arguments to create the Scanner object
* @return Pointer to the created Scanner object
*/
template<Token::Type TokenType, typename... args>
static std::shared_ptr<Scanner> create(const float& weight, args...);
};
/**
* Pointer to a Scanner object
*/
typedef std::shared_ptr<Scanner> ScannerPtr;
}
#endif //UTUE_PE_TOOLS_LSH_TOKEN_H
#endif //UTUE_PE_TOOLS_LSH_SCANNER_H
......@@ -49,88 +49,223 @@
*
* @package utue.pe.tools.lsh
*
* @todo add brief description
* @brief Defines a token object. A token object represents a semantic unit of a text.
*/
#ifndef UTUE_PE_TOOLS_LSH_TOKEN_H
#define UTUE_PE_TOOLS_LSH_TOKEN_H
#include <unordered_map>
#include <utue/pe/utils/HashMap.h>
namespace utue::pe::tools::lsh {
/**
* @todo add brief description
* A Token represents any kind of semantic meaning in a string
*/
class Token {
public:
/**
* @todo add brief description
* @brief Types of different Tokens
*/
enum Type : unsigned char {
/**
* Generic type for an unspecified Token
*/
TOKEN = 0x00,
/**
* A token representing a word
*/
WORD = 0x10,
/**
* A token extracted from a string using a fixed window size.
*/
NGRAM = 0x11,
/**
* Not implemented
*/
INTEGER = 0x20,
/**
* Not implemented
*/
FLOAT = 0x30
};
/**
* @todo add brief description
* @brief Provides a hash function for std::unordered_map
*/
class Hash {
public:
/**
* Calculates the hash of a Token
* @param key Token to calculate hash
* @return Hash of the token
*/
unsigned long operator()(const Token &key) const;
};
/**
* @todo add brief description
* @brief Provides a comparator for std::unordered_map
*/
class Equal {
public:
/**
* Compares two Tokens without using the hash
* @param left First Token
* @param right Second Token
* @return True if both objects are the same
*/
bool operator()(const Token& left, const Token& right) const;
};
/**
* HashMap with a Token key
*/
template <class ValueType>
using HashMap = std::unordered_map<Token, ValueType, Token::Hash, Token::Equal>;
using HashMap = utils::HashMap<Token, ValueType, Token::Hash, Token::Equal>;
/**
* Hash Set with a Token key
*/
typedef utils::HashSet<Token,Token::Hash,Token::Equal> HashSet;
private:
/**
* Type of the Token
*/
Type m_type;
/**
* Weight of the Token
*/
float m_weight;
/**
* Pointer to the value of the Token
*/
char* m_value;
/**
* Size of the value
*/
unsigned int m_length;
public:
/**
* Creates an empty Token
* The type is initialized to TOKEN, the weight to 1.0f, and the value to null
*/
Token();
/**
* Copy a token
* @param token to copy
*/
Token(const Token& token);
Token(const Type& type, const void* value, const unsigned int& length, const float& weight = 1.0f);
/**
* Create a full initialized Token
* @param type Type of the Token
* @param value Value of the Token
* @param length Size of the Value
* @param weight Weight of the Token
*/
Token(const Type& type, const char* value, const unsigned int& length, const float& weight = 1.0f);
/**
* Destroy the Token
*/
virtual ~Token();
/**
* Returns the Type of the Token
* @return Type of Token
*/
[[nodiscard]] Type& type();
/**
* Returns the Type of the Token
* @return Type of Token
*/
[[nodiscard]] const Type& type() const;
/**
* Set the Type of the Token
* @param type Type of the Token
*/
void type(const Type& type);
/**
* Returns the weight of the Token
* @return Weight of the Token
*/
[[nodiscard]] float& weight();
/**
* Returns the weight of the Token
* @return Weight of the Token
*/
[[nodiscard]] const float& weight() const;
/**
* Set the weight of the Token
* @param weight Weight of the Token
*/
void weight(const float& weight);
[[nodiscard]] void* value();
[[nodiscard]] const void* value() const;
/**
* Returns the pointer to the Token value
* @return Value of the Token
*/
[[nodiscard]] char* value();
/**
* Returns the pointer to the Token value
* @return Value of the Token
*/
[[nodiscard]] const char* value() const;
/**
* Returns the size of the Token value
* @return Size of the Token value
*/
[[nodiscard]] unsigned int& length();
/**
* Returns the size of the Token value
* @return Size of the Token value
*/
[[nodiscard]] const unsigned int& length() const;
void value(const void* value, const unsigned int& length);
/**
* Set the value of the Token
* @param value New value of the Token
* @param length Size of the new Token value