Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
David Geisler
pe-tools-lsh
Commits
9cc070a0
Commit
9cc070a0
authored
Nov 10, 2019
by
David Geisler
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
small changes
parent
a4c95c09
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
16 additions
and
16 deletions
+16
-16
include/utue/pe/tools/lsh/MinHash.h
include/utue/pe/tools/lsh/MinHash.h
+2
-2
modules/utils
modules/utils
+1
-1
source/utue/pe/tools/lsh/NGram.cpp
source/utue/pe/tools/lsh/NGram.cpp
+1
-1
source/utue/pe/tools/lsh/example/MinHash.cpp
source/utue/pe/tools/lsh/example/MinHash.cpp
+6
-6
source/utue/pe/tools/lsh/example/SubsMatch.cpp
source/utue/pe/tools/lsh/example/SubsMatch.cpp
+6
-6
No files found.
include/utue/pe/tools/lsh/MinHash.h
View file @
9cc070a0
...
...
@@ -191,7 +191,7 @@ namespace utue::pe::tools::lsh {
*/
Similarity
();
[[
nodiscard
]]
std
::
string
toString
(
const
unsigned
int
&
top
=
20
,
int
cw
=
10
)
const
;
[[
nodiscard
]]
std
::
string
toString
(
const
unsigned
int
&
top
=
5
,
int
cw
=
10
)
const
;
};
private:
...
...
@@ -231,7 +231,7 @@ namespace utue::pe::tools::lsh {
* @param hashes Number of different hash functions resulting in different permutations.
* @param buckets Number of buckets in the chained hash set of Tokens
*/
MinHash
(
const
unsigned
int
&
hashes
=
100
,
const
unsigned
int
&
buckets
=
500
);
explicit
MinHash
(
const
unsigned
int
&
hashes
=
100
,
const
unsigned
int
&
buckets
=
500
);
/**
* Creates a new document
...
...
utils
@
eff728e5
Compare
69f62046
...
eff728e5
Subproject commit
69f6204673e5f42acf45298d47ca55d7618b5a8c
Subproject commit
eff728e5b500b51ac3b4f9de99d1d852f70a9ad3
source/utue/pe/tools/lsh/NGram.cpp
View file @
9cc070a0
...
...
@@ -101,7 +101,7 @@ namespace utue::pe::tools::lsh {
};
template
<
>
ScannerPtr
Scanner
::
create
<
Token
::
NGRAM
>
(
const
float
&
weight
,
int
length
,
int
leaks
)
{
ScannerPtr
Scanner
::
create
<
Token
::
NGRAM
>
(
const
float
&
weight
,
unsigned
int
length
,
unsigned
int
leaks
)
{
return
ScannerPtr
(
new
NGramScanner
(
length
,
leaks
,
weight
));
}
}
\ No newline at end of file
source/utue/pe/tools/lsh/example/MinHash.cpp
View file @
9cc070a0
...
...
@@ -119,13 +119,13 @@ int main(int argc, const char** argv) {
minHash
=
MinHash
::
create
(
hashes
,
buckets
);
// add scanners to tokenize the strings into ngrams
minHash
->
createScanner
<
Token
::
NGRAM
>
(
1.0
f
,
3
,
0
);
// extracts 3-grams and weights them by 1.0
minHash
->
createScanner
<
Token
::
NGRAM
>
(
1.0
f
,
4
,
1
);
// 4-grams but with one leak
minHash
->
createScanner
<
Token
::
NGRAM
>
(
1.0
f
,
5
,
2
);
// 5-grams but with two leaks
minHash
->
createScanner
<
Token
::
NGRAM
>
(
1.0
f
,
3
u
,
0
u
);
// extracts 3-grams and weights them by 1.0
minHash
->
createScanner
<
Token
::
NGRAM
>
(
1.0
f
,
4
u
,
1
u
);
// 4-grams but with one leak
minHash
->
createScanner
<
Token
::
NGRAM
>
(
1.0
f
,
5
u
,
2
u
);
// 5-grams but with two leaks
minHash
->
createScanner
<
Token
::
NGRAM
>
(
1.5
f
,
4
,
0
);
// extracts 4-grams and weights them by 1.5
minHash
->
createScanner
<
Token
::
NGRAM
>
(
1.5
f
,
5
,
1
);
// 5-grams but with one leak
minHash
->
createScanner
<
Token
::
NGRAM
>
(
1.5
f
,
6
,
2
);
// 6-grams but with two leaks
minHash
->
createScanner
<
Token
::
NGRAM
>
(
1.5
f
,
4
u
,
0
u
);
// extracts 4-grams and weights them by 1.5
minHash
->
createScanner
<
Token
::
NGRAM
>
(
1.5
f
,
5
u
,
1
u
);
// 5-grams but with one leak
minHash
->
createScanner
<
Token
::
NGRAM
>
(
1.5
f
,
6
u
,
2
u
);
// 6-grams but with two leaks
//
minHash
->
createScanner
<
Token
::
WORD
>
(
2.0
f
);
...
...
source/utue/pe/tools/lsh/example/SubsMatch.cpp
View file @
9cc070a0
...
...
@@ -101,13 +101,13 @@ int main(int argc, const char** argv) {
subsMatch
=
SubsMatch
::
create
();
// add scanners to tokenize the strings into ngrams
subsMatch
->
createScanner
<
Token
::
NGRAM
>
(
1.0
f
,
3
,
0
);
// extracts 3-grams and weights them by 1.0
subsMatch
->
createScanner
<
Token
::
NGRAM
>
(
1.0
f
,
4
,
1
);
// 4-grams but with one leak
subsMatch
->
createScanner
<
Token
::
NGRAM
>
(
1.0
f
,
5
,
2
);
// 5-grams but with two leaks
subsMatch
->
createScanner
<
Token
::
NGRAM
>
(
1.0
f
,
3
u
,
0
u
);
// extracts 3-grams and weights them by 1.0
subsMatch
->
createScanner
<
Token
::
NGRAM
>
(
1.0
f
,
4
u
,
1
u
);
// 4-grams but with one leak
subsMatch
->
createScanner
<
Token
::
NGRAM
>
(
1.0
f
,
5
u
,
2
u
);
// 5-grams but with two leaks
subsMatch
->
createScanner
<
Token
::
NGRAM
>
(
1.5
f
,
4
,
0
);
// extracts 4-grams and weights them by 1.5
subsMatch
->
createScanner
<
Token
::
NGRAM
>
(
1.5
f
,
5
,
1
);
// 5-grams but with one leak
subsMatch
->
createScanner
<
Token
::
NGRAM
>
(
1.5
f
,
6
,
2
);
// 6-grams but with two leaks
subsMatch
->
createScanner
<
Token
::
NGRAM
>
(
1.5
f
,
4
u
,
0
u
);
// extracts 4-grams and weights them by 1.5
subsMatch
->
createScanner
<
Token
::
NGRAM
>
(
1.5
f
,
5
u
,
1
u
);
// 5-grams but with one leak
subsMatch
->
createScanner
<
Token
::
NGRAM
>
(
1.5
f
,
6
u
,
2
u
);
// 6-grams but with two leaks
//
subsMatch
->
createScanner
<
Token
::
WORD
>
(
2.0
f
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment