Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
David Geisler
pe-tools-lsh
Commits
e2849028
Commit
e2849028
authored
Oct 24, 2019
by
David Geisler
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
small improvements
parent
b569a6a4
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
541 additions
and
124 deletions
+541
-124
include/utue/pe/tools/lsh/MinHash.h
include/utue/pe/tools/lsh/MinHash.h
+164
-5
include/utue/pe/tools/lsh/Scanner.h
include/utue/pe/tools/lsh/Scanner.h
+1
-1
include/utue/pe/tools/lsh/Token.h
include/utue/pe/tools/lsh/Token.h
+4
-8
include/utue/pe/utils/HashSet.h
include/utue/pe/utils/HashSet.h
+51
-6
include/utue/pe/utils/TicToc.h
include/utue/pe/utils/TicToc.h
+1
-1
source/utue/pe/tools/lsh/Dictionary.cpp
source/utue/pe/tools/lsh/Dictionary.cpp
+1
-1
source/utue/pe/tools/lsh/MinHash.cpp
source/utue/pe/tools/lsh/MinHash.cpp
+235
-51
source/utue/pe/tools/lsh/NGram.cpp
source/utue/pe/tools/lsh/NGram.cpp
+11
-13
source/utue/pe/tools/lsh/Token.cpp
source/utue/pe/tools/lsh/Token.cpp
+28
-20
source/utue/pe/tools/lsh/example/MinHash.cpp
source/utue/pe/tools/lsh/example/MinHash.cpp
+45
-18
No files found.
include/utue/pe/tools/lsh/MinHash.h
View file @
e2849028
...
...
@@ -70,7 +70,132 @@ namespace utue::pe::tools::lsh {
* @brief Implements a MinHash algorithm to estimate the similarity of multiple Documents
*/
class
MinHash
{
public:
/**
 * @brief Bundles the inputs, parameters and results of one MinHash similarity calculation
 */
class Similarity {
public:

    /**
     * @brief One row of the similarity report, describing a single Token
     */
    class ReportEntry {
    public:
        /** Token this row reports on */
        Token m_token;

        /** Frequency of the Token in the left Document */
        float m_leftFreq;

        /** Frequency of the Token in the right Document */
        float m_rightFreq;

        /** Frequency of the Token in the Dictionary */
        float m_dictFreq;

        /** Weight calculated for the Token */
        float m_weight;

        /** Fractional count of matches found while thresholding */
        float m_matches;

        /**
         * Constructs an empty ReportEntry
         */
        ReportEntry();
    };

    /** Left Document of the comparison */
    DocumentPtr m_left;

    /** Right Document of the comparison */
    DocumentPtr m_right;

    /** Number of thresholds used to break the Token frequencies down to a boolean space */
    unsigned int m_thresholds;

    /** Seed of the random number generator */
    unsigned int m_seed;

    /** Sum of the weights of all Token matches */
    float m_score;

    /** Sum of all weights */
    float m_count;

    /** Resulting similarity score */
    float m_similarity;

    /** Number of hashes/permutations */
    unsigned int m_hashes{};

    /** Number of buckets per hash table */
    unsigned int m_buckets;

    /** Processing time in seconds */
    float m_seconds;

    /** Report rows of all investigated Token matches */
    std::list<ReportEntry> m_report;

    /**
     * Creates a new Similarity object
     * @param left Left Document
     * @param right Right Document
     * @param thresholds Number of thresholds
     * @param seed Random number generator seed
     * @param hashes Number of hashes/permutations
     * @param buckets Number of buckets in the hash tables
     */
    Similarity(DocumentPtr left,
               DocumentPtr right,
               const unsigned int& thresholds,
               const unsigned int& seed,
               const unsigned int& hashes,
               const unsigned int& buckets);

    /**
     * Creates an empty Similarity object
     */
    Similarity();

    /**
     * Renders the result as a text report
     * @param top Number of report rows after which the output stops
     * @param cw Width of one table column in characters
     * @return Report as string
     */
    [[nodiscard]] std::string toString(const unsigned int& top = 20, int cw = 10) const;
};
private:
/**
* Dictionary of all related Documents
* @see Dictionary
...
...
@@ -89,6 +214,16 @@ namespace utue::pe::tools::lsh {
*/
std
::
vector
<
Token
::
HashSet
>
m_permutations
;
/**
* Number of hashes/permutations
*/
unsigned
int
m_hashes
;
/**
* Number of buckets in each hash table
*/
unsigned
int
m_buckets
;
public:
/**
...
...
@@ -102,14 +237,14 @@ namespace utue::pe::tools::lsh {
* Creates a new document
* @return document Empty document linked to the MinHash Dictionary and Scanner
*/
DocumentPtr
createDocument
();
[[
nodiscard
]]
DocumentPtr
createDocument
();
/**
* Creates a new document
* @param text content of the document
* @return Document with initialized content linked to the MinHash Dictionary and Scanner
*/
DocumentPtr
createDocument
(
const
std
::
string
&
text
);
[[
nodiscard
]]
DocumentPtr
createDocument
(
const
std
::
string
&
text
);
/**
* creates a new token scanner
...
...
@@ -136,6 +271,12 @@ namespace utue::pe::tools::lsh {
*/
void
add
(
const
Token
&
token
);
/**
* Calculates the similarity of two Documents
* @param similarity Read/Write container with two documents
*/
void
similarity
(
Similarity
&
similarity
);
/**
* Calculates the similarity of two Documents
* @param left The first Document
...
...
@@ -144,7 +285,7 @@ namespace utue::pe::tools::lsh {
* @param seed Random number generator seed
* @return Similarity of two Documents
*/
float
similarity
(
const
DocumentPtr
&
left
,
const
DocumentPtr
&
right
,
const
unsigned
int
&
thresholds
=
10
,
const
unsigned
int
&
seed
=
0
);
[[
nodiscard
]]
Similarity
similarity
(
const
DocumentPtr
&
left
,
const
DocumentPtr
&
right
,
const
unsigned
int
&
thresholds
=
10
,
const
unsigned
int
&
seed
=
0
);
/**
* Calculates the similarity of two strings
...
...
@@ -154,7 +295,7 @@ namespace utue::pe::tools::lsh {
* @param seed Random number generator seed
* @return Similarity of two strings
*/
float
similarity
(
const
std
::
string
&
left
,
const
std
::
string
&
right
,
const
unsigned
int
&
thresholds
=
10
,
const
unsigned
int
&
seed
=
0
);
[[
nodiscard
]]
Similarity
similarity
(
const
std
::
string
&
left
,
const
std
::
string
&
right
,
const
unsigned
int
&
thresholds
=
10
,
const
unsigned
int
&
seed
=
0
);
/**
* Add a Scanner to the MinHash
...
...
@@ -162,13 +303,31 @@ namespace utue::pe::tools::lsh {
*/
void
add
(
const
ScannerPtr
&
scanner
);
/**
* Returns the dictionary
* @return Dictionary
*/
[[
nodiscard
]]
const
DictionaryPtr
&
dictionary
()
const
;
/**
* Returns the maximal bucket load
* @return Bucket load
*/
[[
nodiscard
]]
unsigned
long
maxLoad
()
const
;
/**
* Returns the average bucket load
* @return Bucket load
*/
[[
nodiscard
]]
float
avgLoad
()
const
;
/**
* Creates a MinHash object
* @param hashes Number of different hash functions resulting in different permutations.
* @param buckets Number of buckets in the chained hash set of Tokens
* @return Pointer to the created MinHash object
*/
static
std
::
shared_ptr
<
MinHash
>
create
(
const
unsigned
int
&
hashes
=
1000
,
const
unsigned
int
&
buckets
=
500
);
[[
nodiscard
]]
static
std
::
shared_ptr
<
MinHash
>
create
(
const
unsigned
int
&
hashes
=
1000
,
const
unsigned
int
&
buckets
=
500
);
};
/**
...
...
include/utue/pe/tools/lsh/Scanner.h
View file @
e2849028
...
...
@@ -55,9 +55,9 @@
#ifndef UTUE_PE_TOOLS_LSH_SCANNER_H
#define UTUE_PE_TOOLS_LSH_SCANNER_H
#include <utue/pe/tools/lsh/Token.h>
#include <functional>
#include <memory>
#include <utue/pe/tools/lsh/Token.h>
namespace
utue
::
pe
::
tools
::
lsh
{
...
...
include/utue/pe/tools/lsh/Token.h
View file @
e2849028
...
...
@@ -147,9 +147,9 @@ namespace utue::pe::tools::lsh {
float
m_weight
;
/**
*
Pointer to the v
alue of the Token
*
V
alue of the Token
*/
char
*
m_value
;
char
m_value
[
32
]
;
/**
* Size of the value
...
...
@@ -164,12 +164,6 @@ namespace utue::pe::tools::lsh {
*/
Token
();
/**
* Copy a token
* @param token to copy
*/
Token
(
const
Token
&
token
);
/**
* Create a full initialized Token
* @param type Type of the Token
...
...
@@ -267,6 +261,8 @@ namespace utue::pe::tools::lsh {
* @return String showing the Token variables
*/
[[
nodiscard
]]
std
::
string
toString
()
const
;
static
std
::
string
typeToString
(
const
Type
&
type
);
};
}
...
...
include/utue/pe/utils/HashSet.h
View file @
e2849028
...
...
@@ -168,9 +168,11 @@ namespace utue::pe::utils {
HashSet
(
const
unsigned
int
&
buckets
=
100
,
const
unsigned
long
&
seed
=
0
)
:
m_hash
(),
m_equal
(),
m_buckets
(
buckets
),
m_buckets
(),
m_active_buckets
(),
m_seed
(
seed
)
{
}
m_seed
(
seed
)
{
this
->
m_buckets
.
resize
(
buckets
);
}
/**
* Inserts an element to the HashSet
...
...
@@ -192,12 +194,24 @@ namespace utue::pe::utils {
return
this
->
m_buckets
[
i
].
emplace_back
(
element
);
}
/**
 * Checks whether an element equal to the given one is stored in the HashSet
 * @param element Element to look for
 * @return true if the bucket selected by the element's hash holds an equal entry, false otherwise
 */
[[nodiscard]] bool has(const T& element) {
    // Only the bucket addressed by the hash can contain the element
    const unsigned long index = this->hash(element);
    auto& bucket = this->m_buckets[index];

    for (auto candidate = bucket.begin(); candidate != bucket.end(); ++candidate) {
        if (this->m_equal(*candidate, element)) {
            return true;
        }
    }
    return false;
}
/**
* Returns an iterator container for fast bucket iteration.
* Only active buckets will be considered.
* @return Iterator container
*/
Iterator
<
FastBucketIterator
>
fastIterator
()
{
[[
nodiscard
]]
Iterator
<
FastBucketIterator
>
fastIterator
()
{
return
Iterator
<
FastBucketIterator
>
(
this
->
m_active_buckets
.
begin
(),
this
->
m_active_buckets
.
end
());
}
...
...
@@ -206,7 +220,7 @@ namespace utue::pe::utils {
* All buckets will be considered, if active or not.
* @return Iterator container
*/
Iterator
<
SlowBucketIterator
>
slowIterator
()
{
[[
nodiscard
]]
Iterator
<
SlowBucketIterator
>
slowIterator
()
{
return
Iterator
<
SlowBucketIterator
>
(
this
->
m_buckets
.
begin
(),
this
->
m_buckets
.
end
());
}
...
...
@@ -215,7 +229,7 @@ namespace utue::pe::utils {
* Only active buckets will be considered.
* @return Iterator container
*/
Iterator
<
FastConstBucketIterator
>
fastIterator
()
const
{
[[
nodiscard
]]
Iterator
<
FastConstBucketIterator
>
fastIterator
()
const
{
return
Iterator
<
FastConstBucketIterator
>
(
this
->
m_active_buckets
.
begin
(),
this
->
m_active_buckets
.
end
());
}
...
...
@@ -224,10 +238,41 @@ namespace utue::pe::utils {
* All buckets will be considered, if active or not.
* @return Iterator container
*/
Iterator
<
SlowConstBucketIterator
>
slowIterator
()
const
{
[[
nodiscard
]]
Iterator
<
SlowConstBucketIterator
>
slowIterator
()
const
{
return
Iterator
<
SlowConstBucketIterator
>
(
this
->
m_buckets
.
begin
(),
this
->
m_buckets
.
end
());
}
/**
 * Returns the maximal bucket load
 * Only active buckets are inspected; the result is 0 if there are none.
 * @return Size of the largest active bucket
 */
[[nodiscard]] unsigned long maxLoad() const {
    unsigned long peak = 0;

    for (const auto& active : this->m_active_buckets) {
        const unsigned long occupancy = active->size();
        peak = (occupancy > peak) ? occupancy : peak;
    }

    return peak;
}
/**
 * Returns the average bucket load
 * The average is taken over the active buckets only.
 * @return Mean number of elements per active bucket, or 0 if no bucket is active
 */
[[nodiscard]] float avgLoad() const {
    // Without active buckets the division below would be 0/0 and yield NaN
    if (this->m_active_buckets.size() == 0)
        return 0.0f;

    float load = 0.0f;
    for (const auto& bucket : this->m_active_buckets)
        load += static_cast<float>(bucket->size());

    return load / static_cast<float>(this->m_active_buckets.size());
}
protected:
/**
...
...
include/utue/pe/utils/TicToc.h
View file @
e2849028
...
...
@@ -124,7 +124,7 @@ namespace utue::pe::utils {
* Creates a time measurement relative to a given Tic
* @param tic Start timestamp
*/
Toc
(
const
Tic
&
tic
);
explicit
Toc
(
const
Tic
&
tic
);
/**
* Creates a time measurement relative to the last Tic.
...
...
source/utue/pe/tools/lsh/Dictionary.cpp
View file @
e2849028
...
...
@@ -84,6 +84,6 @@ namespace utue::pe::tools::lsh {
}
float
Dictionary
::
frequency
(
const
Token
&
token
)
const
{
return
float
(
this
->
m_tokens
(
token
,
0
))
/
float
(
this
->
m_
max
);
return
float
(
this
->
m_tokens
(
token
,
0
))
/
float
(
this
->
m_
count
);
}
}
\ No newline at end of file
source/utue/pe/tools/lsh/MinHash.cpp
View file @
e2849028
...
...
@@ -58,10 +58,145 @@
#include <memory>
#include <cmath>
#include <random>
#include <utility>
#include <utue/pe/utils/TicToc.h>
#include <sstream>
#include <iomanip>
namespace
utue
::
pe
::
tools
::
lsh
{
MinHash
::
Similarity
::
ReportEntry
::
ReportEntry
()
:
m_token
(),
m_leftFreq
(
0.0
f
),
m_rightFreq
(
0.0
f
),
m_dictFreq
(
0.0
f
),
m_weight
(
0.0
f
),
m_matches
(
0.0
f
)
{};
MinHash
::
Similarity
::
Similarity
(
DocumentPtr
left
,
DocumentPtr
right
,
const
unsigned
int
&
thresholds
,
const
unsigned
int
&
seed
,
const
unsigned
int
&
hashes
,
const
unsigned
int
&
buckets
)
:
m_left
(
std
::
move
(
left
)),
m_right
(
std
::
move
(
right
)),
m_thresholds
(
thresholds
),
m_seed
(
seed
),
m_score
(
0.0
f
),
m_count
(
0.0
f
),
m_similarity
(
0.0
f
),
m_hashes
(
hashes
),
m_buckets
(
buckets
),
m_seconds
(
0.0
f
),
m_report
()
{
}
MinHash
::
Similarity
::
Similarity
()
:
m_left
(),
m_right
(),
m_thresholds
(
0
),
m_seed
(
0
),
m_score
(
0.0
f
),
m_count
(
0.0
f
),
m_similarity
(
0.0
f
),
m_hashes
(
0
),
m_buckets
(
0
),
m_seconds
(
0.0
f
),
m_report
()
{
}
/**
 * Renders the similarity result as a text report.
 *
 * The report starts with a summary (hashes, buckets, thresholds, seed, runtime,
 * similarity) followed by a ten-column table with one row per report entry.
 * Rows are ordered by descending m_matches * m_weight.
 *
 * @param top Number of rows after which the table is cut off. The row counter is
 *            incremented before it is compared with top, so a value of 0 never
 *            matches and all rows are printed.
 * @param cw  Width of one table column in characters
 * @return The formatted report
 */
std::string MinHash::Similarity::toString(const unsigned int& top, int cw) const {

    std::stringstream ss;

    // Writes one horizontal separator row: " +" followed by ten "-----+" segments
    const auto rule = [&ss, cw]() {
        ss << " +";
        for (int column = 0; column < 10; ++column)
            ss << std::setw(cw + 3) << std::setfill('-') << "+";
        ss << '\n';
    };

    // Opens a right-aligned, space-padded cell of the given width
    const auto cell = [&ss](int width) -> std::ostream& {
        return ss << " | " << std::setw(width) << std::setfill(' ');
    };

    // Writes a percentage cell; one character of the column is reserved for the '%' sign
    const auto percent = [&cell, cw](float value) {
        cell(cw - 1) << std::fixed << std::setprecision(2) << value << "%";
    };

    // Work on a copy, the report is sorted below and this method is const
    std::list<ReportEntry> report(this->m_report);

    // list::sort is stable, entries with equal score keep their original order
    report.sort([](const ReportEntry& left, const ReportEntry& right) -> bool {
        return (left.m_matches * left.m_weight) > (right.m_matches * right.m_weight);
    });

    // A zero weight sum would turn the relative weight and score columns into inf/NaN
    const bool hasTotal = this->m_count != 0.0f;

    // Summary
    ss << " --------------- R E P O R T ---------------" << '\n';
    ss << " Number of hashes: " << this->m_hashes << '\n';
    ss << " Number of buckets: " << this->m_buckets << '\n';
    ss << " Number of thresholds: " << this->m_thresholds << '\n';
    ss << " Seed: " << this->m_seed << '\n';
    ss << " Runtime: " << this->m_seconds << " seconds" << '\n';
    ss << " Similarity: " << this->m_similarity << '\n';

    // Group header; "Token" spans four columns and "MinHash" three
    rule();
    cell(cw * 4 + 9) << "Token";
    cell(cw) << "Left";
    cell(cw) << "Right";
    cell(cw) << "Dictionary";
    cell(cw * 3 + 6) << "MinHash";
    ss << " | " << '\n';

    // Column header
    rule();
    for (const char* title : {"Type", "Length", "Value", "Weight", "Frequency",
                              "Frequency", "Frequency", "Matches", "Weight", "Score"})
        cell(cw) << title;
    ss << " | " << '\n';
    rule();

    // Table body
    unsigned int count = 0;
    for (const auto& entry : report) {
        ++count;

        const float weightShare = hasTotal
            ? 100.0f * entry.m_weight / this->m_count
            : 0.0f;
        const float scoreShare = hasTotal
            ? 100.0f * entry.m_matches * entry.m_weight / this->m_count
            : 0.0f;

        cell(cw) << Token::typeToString(entry.m_token.type());
        cell(cw) << entry.m_token.length();
        cell(cw) << ("\"" + std::string(entry.m_token.value(), entry.m_token.length()) + "\"");
        cell(cw) << std::fixed << std::setprecision(2) << entry.m_token.weight();
        percent(100.0f * entry.m_leftFreq);
        percent(100.0f * entry.m_rightFreq);
        percent(100.0f * entry.m_dictFreq);
        percent(100.0f * entry.m_matches);
        percent(weightShare);
        percent(scoreShare);
        ss << " | " << '\n';

        if (count == top)
            break;
    }

    rule();

    return ss.str();
}
class
MinHashDictionary
:
public
Dictionary
{
private:
MinHash
*
m_instance
;
...
...
@@ -94,9 +229,11 @@ namespace utue::pe::tools::lsh {
MinHash
::
MinHash
(
const
unsigned
int
&
hashes
,
const
unsigned
int
&
buckets
)
:
m_dictionary
(
new
MinHashDictionary
(
this
)),
m_scanner
(
new
MinHashScanner
()),
m_permutations
()
{
m_permutations
(),
m_hashes
(
hashes
),
m_buckets
(
buckets
)
{
for
(
unsigned
int
i
=
0
;
i
<
hashes
;
i
++
)
{
this
->
m_permutations
.
emplace_back
(
Token
::
HashSet
(
buckets
,
i
)
)
;
this
->
m_permutations
.
emplace_back
(
buckets
,
i
);
}
}
...
...
@@ -117,75 +254,96 @@ namespace utue::pe::tools::lsh {
permutation
.
insert
(
token
);
}
float
MinHash
::
similarity
(
const
DocumentPtr
&
left
,
const
DocumentPtr
&
right
,
const
unsigned
int
&
thresholds
,
const
unsigned
int
&
seed
)
{
float
leftFreq
;
float
rightFreq
;
float
dictFreq
;
void
MinHash
::
similarity
(
Similarity
&
similarity
)
{
Similarity
::
ReportEntry
entry
;
utils
::
Tic
tic
;
// Hyperplane to threshold the frequencies
float
thresh
;
float
weight
;
float
score
;
float
count
;
utils
::
Random
::
seed
(
seed
)
;
bool
done
;
count
=
0.0
f
;
s
core
=
0.0
f
;
// cleanup report
s
imilarity
.
m_report
.
clear
()
;
// set seed for random number generator
utils
::
Random
::
seed
(
similarity
.
m_seed
);
similarity
.
m_count
=
0.0
f
;
similarity
.
m_score
=
0.0
f
;
similarity
.
m_hashes
=
this
->
m_hashes
;
similarity
.
m_buckets
=
this
->
m_buckets
;
// Iterate over permutations
for
(
const
auto
&
permutation
:
this
->
m_permutations
)
{
// skip if permutation list is empty
done
=
false
;
// Iterate over buckets
for
(
const
auto
&
bucket
:
permutation
.
slowIterator
())
{
//Its important to use the slow iterator here!
//The fast iterator almost depicts the insertion
//order of the Tokens. The permutation is therefore
//weak, and earlier Tokens are higher weighted.
//
i
terate over tokens in permutation list
//The fast iterator almost depicts the insertion
//order of the Tokens. The permutation is therefore
//weak, and earlier Tokens are higher weighted.
//
I
terate over tokens in permutation list