Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
David Geisler
pe-tools-lsh
Commits
a081d3cd
Commit
a081d3cd
authored
Oct 23, 2019
by
David Geisler
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
First working version
parent
da34f779
Changes
28
Hide whitespace changes
Inline
Side-by-side
Showing
28 changed files
with
1483 additions
and
511 deletions
+1483
-511
README.md
README.md
+20
-1
include/utue/pe/tools/lsh/Dictionary.h
include/utue/pe/tools/lsh/Dictionary.h
+54
-13
include/utue/pe/tools/lsh/Document.h
include/utue/pe/tools/lsh/Document.h
+20
-10
include/utue/pe/tools/lsh/MinHash.h
include/utue/pe/tools/lsh/MinHash.h
+2
-21
include/utue/pe/tools/lsh/NGram.h
include/utue/pe/tools/lsh/NGram.h
+3
-8
include/utue/pe/tools/lsh/Scanner.h
include/utue/pe/tools/lsh/Scanner.h
+41
-87
include/utue/pe/tools/lsh/Token.h
include/utue/pe/tools/lsh/Token.h
+147
-12
include/utue/pe/tools/lsh/Word.h
include/utue/pe/tools/lsh/Word.h
+4
-11
include/utue/pe/utils/HashMap.h
include/utue/pe/utils/HashMap.h
+78
-56
include/utue/pe/utils/HashSet.h
include/utue/pe/utils/HashSet.h
+93
-18
include/utue/pe/utils/Random.h
include/utue/pe/utils/Random.h
+86
-6
include/utue/pe/utils/TicToc.h
include/utue/pe/utils/TicToc.h
+101
-22
source/utue/CMakeLists.txt
source/utue/CMakeLists.txt
+0
-2
source/utue/pe/CMakeLists.txt
source/utue/pe/CMakeLists.txt
+0
-2
source/utue/pe/tools/CMakeLists.txt
source/utue/pe/tools/CMakeLists.txt
+0
-2
source/utue/pe/tools/lsh/CMakeLists.txt
source/utue/pe/tools/lsh/CMakeLists.txt
+2
-1
source/utue/pe/tools/lsh/Dictionary.cpp
source/utue/pe/tools/lsh/Dictionary.cpp
+18
-22
source/utue/pe/tools/lsh/Document.cpp
source/utue/pe/tools/lsh/Document.cpp
+78
-3
source/utue/pe/tools/lsh/MinHash.cpp
source/utue/pe/tools/lsh/MinHash.cpp
+128
-13
source/utue/pe/tools/lsh/NGram.cpp
source/utue/pe/tools/lsh/NGram.cpp
+108
-3
source/utue/pe/tools/lsh/Scanner.cpp
source/utue/pe/tools/lsh/Scanner.cpp
+4
-130
source/utue/pe/tools/lsh/Token.cpp
source/utue/pe/tools/lsh/Token.cpp
+183
-3
source/utue/pe/tools/lsh/Word.cpp
source/utue/pe/tools/lsh/Word.cpp
+101
-3
source/utue/pe/tools/lsh/example/CMakeLists.txt
source/utue/pe/tools/lsh/example/CMakeLists.txt
+2
-1
source/utue/pe/tools/lsh/example/MinHash.cpp
source/utue/pe/tools/lsh/example/MinHash.cpp
+128
-3
source/utue/pe/utils/CMakeLists.txt
source/utue/pe/utils/CMakeLists.txt
+1
-4
source/utue/pe/utils/Random.cpp
source/utue/pe/utils/Random.cpp
+51
-16
source/utue/pe/utils/TicToc.cpp
source/utue/pe/utils/TicToc.cpp
+30
-38
No files found.
README.md
View file @
a081d3cd
# pe-tools-lsh
Scanpath similarity using locality sensitive hashing
\ No newline at end of file
Scanpath similarity using locality sensitive hashing.
The actual algorithm can be found at `source/utue/pe/tools/lsh/MinHash.cpp`.
An example program can be found at `source/utue/pe/tools/lsh/example/MinHash.cpp`.
## Build
There is a cmake script to build the example program:
```bash
$ mkdir build
$ cd build
$ cmake ..
$ make all
```
Note that the project uses git submodules. If you haven't checked them out yet, do it like this:
```bash
$ git submodule update --init
```
include/utue/pe/tools/lsh/Dictionary.h
View file @
a081d3cd
...
...
@@ -49,22 +49,29 @@
*
* @package utue.pe.tools.lsh
*
* @todo add brief description
* @brief Defines a Dictionary type. A dictionary stores tokens in a HashMap and counts their frequency. For example,
* several documents can create a Dictionary and use it to create a common feature space. At the same time, documents
* can also determine their own feature space using a non-shared dictionary. This makes it possible to determine
* Token subsets over several documents.
*/
#ifndef UTUE_PE_TOOLS_LSH_D
OCUMENT
_H
#define UTUE_PE_TOOLS_LSH_D
OCUMENT
_H
#ifndef UTUE_PE_TOOLS_LSH_D
ICTIONARY
_H
#define UTUE_PE_TOOLS_LSH_D
ICTIONARY
_H
#include <sstream>
#include <vector>
#include <algorithm>
#include <random>
#include <utue/pe/tools/lsh/Token.h>
#include <memory>
namespace
utue
::
pe
::
tools
::
lsh
{
class
Document
{
/**
* @brief The dictionary contains all tokens and their frequencies found in the corresponding documents.
*/
class
Dictionary
{
private:
/**
* Hashmap which stores the frequency for each found token
*/
typedef
Token
::
HashMap
<
unsigned
int
>
TokenFreq
;
/**
...
...
@@ -82,15 +89,49 @@ namespace utue::pe::tools::lsh {
*/
unsigned
int
m_max
;
public:
/**
* Creates an empty dictionary
*/
Dictionary
();
/**
*
* @param token
*/
virtual
void
add
(
const
Token
&
token
);
public:
Document
();
/**
* Returns the tokens contained in the Dictionary and their frequency.
* @return Hashmap of all tokens and frequency
*/
[[
nodiscard
]]
const
TokenFreq
&
tokens
()
const
;
/**
* Returns the number of all tokens found.
* @return Number of found tokens
*/
[[
nodiscard
]]
const
unsigned
int
&
count
()
const
;
void
add
(
const
Token
&
token
);
/**
* Returns the number of occurrences of the most frequent token.
* @return Occurrence count of the most frequent token
*/
[[
nodiscard
]]
const
unsigned
int
&
max
()
const
;
void
add
(
const
std
::
string
&
text
);
/**
* Calculates the relative frequency of a token, measured by the most frequently occurring token.
* @param token Token of the requested frequency
* @return relative frequency
*/
[[
nodiscard
]]
float
frequency
(
const
Token
&
token
)
const
;
};
/**
* Pointer to a Dictionary object
*/
typedef
std
::
shared_ptr
<
Dictionary
>
DictionaryPtr
;
}
#endif //UTUE_PE_TOOLS_LSH_D
OCUMENT
_H
#endif //UTUE_PE_TOOLS_LSH_D
ICTIONARY
_H
include/utue/pe/tools/lsh/Document.h
View file @
a081d3cd
...
...
@@ -49,7 +49,9 @@
*
* @package utue.pe.tools.lsh
*
* @todo add brief description
* @brief Defines a Document type. A Document uses a Scanner to extract tokens from its text and saves them in
* different dictionaries.
* @see Dictionary
*/
#ifndef UTUE_PE_TOOLS_LSH_DOCUMENT_H
...
...
@@ -60,40 +62,48 @@
namespace
utue
::
pe
::
tools
::
lsh
{
/**
* @brief A document extracts Tokens from a given text and stores them in its own and in a shared dictionary.
*/
class
Document
:
public
Dictionary
{
private:
/**
*
* The shared dictionary
* @see Dictionary
*/
DictionaryPtr
m_dictionary
;
/**
*
* Scanner to extract Tokens from text
* @see Scanner
*/
ScannerPtr
m_scanner
;
public:
/**
*
* @param dictionary
* @param scanner
*
Create a new Document
* @param dictionary
The related dictionary
* @param scanner
Scanner to extract Tokens from text
*/
Document
(
DictionaryPtr
dictionary
,
ScannerPtr
scanner
);
/**
*
* @param token
*
Add a Token to the Document
* @param token
Token to add
*/
void
add
(
const
Token
&
token
)
override
;
/**
*
* @param text
*
Parse text and add extracted Tokens to the Document
* @param text
Text to parse
*/
void
scan
(
const
std
::
string
&
text
);
};
/**
* Pointer to a Document object
*/
typedef
std
::
shared_ptr
<
Document
>
DocumentPtr
;
}
...
...
include/utue/pe/tools/lsh/MinHash.h
View file @
a081d3cd
...
...
@@ -60,6 +60,7 @@
#define UTUE_PE_TOOLS_LSH_MINHASH_H
#include <utue/pe/tools/lsh/Document.h>
#include <utue/pe/utils/HashSet.h>
#include <memory>
#include <list>
...
...
@@ -70,26 +71,6 @@ namespace utue::pe::tools::lsh {
*/
class
MinHash
{
private:
/**
* @brief Very basic implementation of a chained hash set
*/
class
TokenHashSet
{
public:
typedef
std
::
vector
<
std
::
list
<
Token
>>::
const_iterator
BucketIterator
;
private:
std
::
vector
<
std
::
list
<
Token
>>
m_buckets
;
unsigned
long
m_size
;
unsigned
long
m_seed
;
public:
TokenHashSet
(
const
unsigned
int
&
buckets
,
const
unsigned
long
&
seed
);
void
insert
(
const
Token
&
token
);
BucketIterator
begin
()
const
;
BucketIterator
end
()
const
;
};
/**
* Dictionary of all related Documents
* @see Dictionary
...
...
@@ -106,7 +87,7 @@ namespace utue::pe::tools::lsh {
* List of hash sets of Tokens. Each hash set was calculated by a different hash function. This results in
* different permutations of the tokens.
*/
std
::
vector
<
TokenHashSet
>
m_permutations
;
std
::
vector
<
Token
::
HashSet
>
m_permutations
;
public:
...
...
include/utue/pe/tools/lsh/NGram.h
View file @
a081d3cd
...
...
@@ -49,7 +49,7 @@
*
* @package utue.pe.tools.lsh
*
* @
todo add brief description
* @
brief Defines an NGRAM Token Scanner
*/
#ifndef UTUE_PE_TOOLS_LSH_NGRAM_H
...
...
@@ -59,17 +59,12 @@
namespace
utue
::
pe
::
tools
::
lsh
{
class
NGram
:
public
Token
{
public:
NGram
();
};
/**
*
*
Creates an NGRAM Token Scanner
* @param weight weight of the extracted ngram
* @param length length of the ngram
* @param leaks maximum length of allowed gaps
* @return
* @return
Pointer to the created Scanner object
*/
template
<
>
ScannerPtr
Scanner
::
create
<
Token
::
NGRAM
>
(
const
float
&
weight
,
int
length
,
int
leaks
);
...
...
include/utue/pe/tools/lsh/Scanner.h
View file @
a081d3cd
...
...
@@ -49,116 +49,70 @@
*
* @package utue.pe.tools.lsh
*
* @
todo add brief description
* @
brief Defines an abstract Scanner to extract tokens of a certain type from text.
*/
#ifndef UTUE_PE_TOOLS_LSH_
TOKEN
_H
#define UTUE_PE_TOOLS_LSH_
TOKEN
_H
#ifndef UTUE_PE_TOOLS_LSH_
SCANNER
_H
#define UTUE_PE_TOOLS_LSH_
SCANNER
_H
#include <functional>
#include <unordered_map>
#include <unordered_set>
#include <boost/shared_ptr.hpp>
#include <memory>
#include <utue/pe/tools/lsh/Token.h>
namespace
utue
::
pe
::
tools
::
lsh
{
/**
* @
todo add brief description
* @
brief Extracts Tokens from a given string
*/
class
Token
{
p
ublic
:
class
Scanner
{
p
rivate
:
/**
*
@todo add brief description
*
Weight of the extracted Tokens
*/
enum
Type
:
unsigned
char
{
TOKEN
=
0x00
,
WORD
=
0x10
,
NGRAM
=
0x11
,
INTEGER
=
0x20
,
FLOAT
=
0x30
};
float
m_weight
;
public:
/**
* @todo add brief description
* Creates a new Scanner object
* @param weight Weight of the extracted Tokens
*/
class
Hash
{
public:
unsigned
long
operator
()(
const
Token
&
key
)
const
;
};
explicit
Scanner
(
const
float
&
weight
=
1.0
f
);
/**
* @todo add brief description
* Returns the weight the scanner assigns to the tokens.
* @return Weight of the Tokens
*/
class
Equal
{
public:
bool
operator
()(
const
Token
&
left
,
const
Token
&
right
)
const
;
};
[[
nodiscard
]]
const
float
&
weight
()
const
;
/**
* @todo add brief description
* Sets the weight that is assigned to the Tokens extracted by the Scanner
* @param weight Weight of the Tokens
*/
class
Scanner
{
private:
float
m_weight
;
public:
explicit
Scanner
(
const
float
&
weight
=
1.0
f
);
[[
nodiscard
]]
const
float
&
weight
()
const
;
void
weight
(
const
float
&
weight
);
virtual
void
scan
(
const
std
::
string
&
text
,
std
::
function
<
void
(
const
Token
&
)
>
callback
)
=
0
;
template
<
Type
TokenType
,
typename
...
args
>
static
std
::
shared_ptr
<
Scanner
>
create
(
const
float
&
weight
,
args
...);
};
typedef
std
::
shared_ptr
<
Scanner
>
ScannerPtr
;
template
<
class
ValueType
>
using
HashMap
=
std
::
unordered_map
<
Token
,
ValueType
,
Token
::
Hash
,
Token
::
Equal
>
;
using
HashSet
=
std
::
unordered_set
<
Token
,
Token
::
Hash
,
Token
::
Equal
>
;
private:
Type
m_type
;
float
m_weight
;
char
*
m_value
;
unsigned
int
m_length
;
public:
Token
();
Token
(
const
Token
&
token
);
Token
(
const
Type
&
type
,
const
void
*
value
,
const
unsigned
int
&
length
,
const
float
&
weight
=
1.0
f
);
virtual
~
Token
();
[[
nodiscard
]]
Type
&
type
();
[[
nodiscard
]]
const
Type
&
type
()
const
;
void
type
(
const
Type
&
type
);
[[
nodiscard
]]
float
&
weight
();
[[
nodiscard
]]
const
float
&
weight
()
const
;
void
weight
(
const
float
&
weight
);
[[
nodiscard
]]
void
*
value
();
[[
nodiscard
]]
const
void
*
value
()
const
;
[[
nodiscard
]]
unsigned
int
&
length
();
[[
nodiscard
]]
const
unsigned
int
&
length
()
const
;
void
value
(
const
void
*
value
,
const
unsigned
int
&
length
);
[[
nodiscard
]]
unsigned
long
hash
()
const
;
[[
nodiscard
]]
unsigned
long
hash
(
const
unsigned
long
&
seed
)
const
;
[[
nodiscard
]]
bool
equal
(
const
Token
&
token
)
const
;
/**
* Extract the Tokens of a text
* @param text Text to scan
* @param callback Callback function gets called for each extracted Token
*/
virtual
void
scan
(
const
std
::
string
&
text
,
std
::
function
<
void
(
const
Token
&
)
>
callback
)
=
0
;
[[
nodiscard
]]
std
::
string
toString
()
const
;
/**
* Creates a Scanner for a specific Token type.
* @tparam TokenType Type of the Token to be extracted by the created Scanner.
* @tparam args Argument types.
* @param weight Weight of the Tokens extracted by the Scanner.
* @param ... Additional arguments to create the Scanner object
* @return Pointer to the created Scanner object
*/
template
<
Token
::
Type
TokenType
,
typename
...
args
>
static
std
::
shared_ptr
<
Scanner
>
create
(
const
float
&
weight
,
args
...);
};
/**
* Pointer to a Scanner object
*/
typedef
std
::
shared_ptr
<
Scanner
>
ScannerPtr
;
}
#endif //UTUE_PE_TOOLS_LSH_
TOKEN
_H
#endif //UTUE_PE_TOOLS_LSH_
SCANNER
_H
include/utue/pe/tools/lsh/Token.h
View file @
a081d3cd
...
...
@@ -49,88 +49,223 @@
*
* @package utue.pe.tools.lsh
*
* @
todo add brief description
* @
brief Defines a token object. A token object represents a semantic unit of a text.
*/
#ifndef UTUE_PE_TOOLS_LSH_TOKEN_H
#define UTUE_PE_TOOLS_LSH_TOKEN_H
#include <u
nordered_map
>
#include <u
tue/pe/utils/HashMap.h
>
namespace
utue
::
pe
::
tools
::
lsh
{
/**
*
@todo add brief description
*
A Token represents any kind of semantic meaning in a string
*/
class
Token
{
public:
/**
* @
todo add brief description
* @
brief Types of different Tokens
*/
enum
Type
:
unsigned
char
{
/**
* Generic type for an unspecified Token
*/
TOKEN
=
0x00
,
/**
* A token representing a word
*/
WORD
=
0x10
,
/**
* A token extracted from a string using a fixed window size.
*/
NGRAM
=
0x11
,
/**
* Not implemented
*/
INTEGER
=
0x20
,
/**
* Not implemented
*/
FLOAT
=
0x30
};
/**
* @
todo add brief description
* @
brief Provides a hash function for std::unordered_map
*/
class
Hash
{
public:
/**
* Calculates the hash of a Token
* @param key Token to calculate hash
* @return Hash of the token
*/
unsigned
long
operator
()(
const
Token
&
key
)
const
;
};
/**
* @
todo add brief description
* @
brief Provides a comparator for std::unordered_map
*/
class
Equal
{
public:
/**
* Compares two Tokens without using the hash
* @param left First Token
* @param right Second Token
* @return True if both objects are the same
*/
bool
operator
()(
const
Token
&
left
,
const
Token
&
right
)
const
;
};
/**
* HashMap with a Token key
*/
template
<
class
ValueType
>
using
HashMap
=
std
::
unordered_map
<
Token
,
ValueType
,
Token
::
Hash
,
Token
::
Equal
>
;
using
HashMap
=
utils
::
HashMap
<
Token
,
ValueType
,
Token
::
Hash
,
Token
::
Equal
>
;
/**
* Hash Set with a Token key
*/
typedef
utils
::
HashSet
<
Token
,
Token
::
Hash
,
Token
::
Equal
>
HashSet
;
private:
/**
* Type of the Token
*/
Type
m_type
;
/**
* Weight of the Token
*/
float
m_weight
;
/**
* Pointer to the value of the Token
*/
char
*
m_value
;
/**
* Size of the value
*/
unsigned
int
m_length
;
public:
/**
* Creates an empty Token
* The type is initialized to TOKEN, the weight to 1.0f, and the value to null
*/
Token
();
/**
* Copies a Token
* @param token Token to copy
*/
Token
(
const
Token
&
token
);
Token
(
const
Type
&
type
,
const
void
*
value
,
const
unsigned
int
&
length
,
const
float
&
weight
=
1.0
f
);
/**
* Creates a fully initialized Token
* @param type Type of the Token
* @param value Value of the Token
* @param length Size of the Value
* @param weight Weight of the Token
*/
Token
(
const
Type
&
type
,
const
char
*
value
,
const
unsigned
int
&
length
,
const
float
&
weight
=
1.0
f
);
/**
* Destroy the Token
*/
virtual
~
Token
();
/**
* Returns the Type of the Token
* @return Type of Token
*/
[[
nodiscard
]]
Type
&
type
();
/**
* Returns the Type of the Token
* @return Type of Token
*/
[[
nodiscard
]]
const
Type
&
type
()
const
;
/**
* Set the Type of the Token
* @param type Type of the Token
*/
void
type
(
const
Type
&
type
);
/**
* Returns the weight of the Token
* @return Weight of the Token
*/
[[
nodiscard
]]
float
&
weight
();
/**
* Returns the weight of the Token
* @return Weight of the Token
*/
[[
nodiscard
]]
const
float
&
weight
()
const
;
/**
* Set the weight of the Token
* @param weight Weight of the Token
*/
void
weight
(
const
float
&
weight
);
[[
nodiscard
]]
void
*
value
();
[[
nodiscard
]]
const
void
*
value
()
const
;
/**
* Returns the pointer to the Token value
* @return Value of the Token
*/
[[
nodiscard
]]
char
*
value
();
/**
* Returns the pointer to the Token value
* @return Value of the Token
*/
[[
nodiscard
]]
const
char
*
value
()
const
;
/**
* Returns the size of the Token value
* @return Size of the Token value
*/
[[
nodiscard
]]
unsigned
int
&
length
();
/**
* Returns the size of the Token value
* @return Size of the Token value
*/
[[
nodiscard
]]
const
unsigned
int
&
length
()
const
;
void
value
(
const
void
*
value
,
const
unsigned
int
&
length
);
/**
* Set the value of the Token
* @param value New value of the Token
* @param length Size of the new Token value