Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
David Geisler
pe-tools-lsh
Commits
c4068fce
Commit
c4068fce
authored
Nov 02, 2019
by
David Geisler
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix subsmatch score scaling. add example for subsmatch
parent
62a7ddc3
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
168 additions
and
11 deletions
+168
-11
include/utue/pe/tools/lsh/Dictionary.h
include/utue/pe/tools/lsh/Dictionary.h
+7
-0
source/utue/pe/tools/lsh/Dictionary.cpp
source/utue/pe/tools/lsh/Dictionary.cpp
+4
-0
source/utue/pe/tools/lsh/SubsMatch.cpp
source/utue/pe/tools/lsh/SubsMatch.cpp
+14
-9
source/utue/pe/tools/lsh/example/CMakeLists.txt
source/utue/pe/tools/lsh/example/CMakeLists.txt
+4
-2
source/utue/pe/tools/lsh/example/MinHash.cpp
source/utue/pe/tools/lsh/example/MinHash.cpp
+4
-0
source/utue/pe/tools/lsh/example/SubsMatch.cpp
source/utue/pe/tools/lsh/example/SubsMatch.cpp
+135
-0
No files found.
include/utue/pe/tools/lsh/Dictionary.h
View file @
c4068fce
...
...
@@ -126,6 +126,13 @@ namespace utue::pe::tools::lsh {
* @return relative frequency
*/
[[
nodiscard
]]
float
frequency
(
const
Token
&
token
)
const
;
/**
* Returns the number of occurrences of a token in the documents.
* In contrast to frequency(), which reports a relative frequency, this is the
* absolute count stored for the token.
* NOTE(review): presumably 0 for a token that was never added (the implementation
* passes 0 as second argument to the token table lookup) - confirm.
* @param token Requested token
* @return number of occurrences
*/
[[
nodiscard
]]
unsigned
int
count
(
const
Token
&
token
)
const
;
};
/**
...
...
source/utue/pe/tools/lsh/Dictionary.cpp
View file @
c4068fce
...
...
@@ -86,4 +86,8 @@ namespace utue::pe::tools::lsh {
/**
 * Relative frequency of a token: the value stored for the token in m_tokens
 * divided by m_count, both converted to float before the division.
 * NOTE(review): if m_count can be zero this is a floating point division by
 * zero (NaN or infinity) - confirm whether callers rely on m_count > 0.
 * @param token Requested token
 * @return relative frequency
 */
float Dictionary::frequency(const Token& token) const {
    const float occurrences = static_cast<float>(this->m_tokens(token, 0));
    const float total = static_cast<float>(this->m_count);
    return occurrences / total;
}
/**
 * Absolute number of occurrences recorded for a token.
 * The value is read from m_tokens with 0 passed as second argument.
 * NOTE(review): the 0 is presumably the fallback for unknown tokens - confirm.
 * @param token Requested token
 * @return number of occurrences
 */
unsigned int Dictionary::count(const Token& token) const {
    const unsigned int occurrences = this->m_tokens(token, 0);
    return occurrences;
}
}
\ No newline at end of file
source/utue/pe/tools/lsh/SubsMatch.cpp
View file @
c4068fce
...
...
@@ -124,8 +124,9 @@ namespace utue::pe::tools::lsh {
void
SubsMatch
::
similarity
(
Similarity
&
similarity
)
{
utils
::
Tic
tic
;
float
leftFreq
;
float
rightFreq
;
unsigned
int
leftFreq
;
unsigned
int
rightFreq
;
unsigned
int
dictFreq
;
float
diffFreq
;
float
weight
;
float
sqSum
;
...
...
@@ -136,18 +137,22 @@ namespace utue::pe::tools::lsh {
for
(
const
auto
&
bucket
:
this
->
m_dictionary
->
tokens
().
fastIterator
())
{
for
(
const
std
::
pair
<
Token
,
unsigned
int
>&
token
:
*
bucket
)
{
leftFreq
=
similarity
.
m_left
->
frequency
(
token
.
first
);
rightFreq
=
similarity
.
m_right
->
frequency
(
token
.
first
);
if
(
leftFreq
==
0
.0
f
&&
rightFreq
==
0
.0
f
)
leftFreq
=
similarity
.
m_left
->
count
(
token
.
first
);
rightFreq
=
similarity
.
m_right
->
count
(
token
.
first
);
if
(
leftFreq
==
0
&&
rightFreq
==
0
)
continue
;
dictFreq
=
this
->
m_dictionary
->
count
(
token
.
first
);
weight
=
token
.
first
.
weight
();
diffFreq
=
leftFreq
-
rightFreq
;
sqSum
+=
weight
*
diffFreq
*
diffFreq
;
mxSum
+=
mxSum
;
diffFreq
=
(
float
(
leftFreq
)
-
float
(
rightFreq
))
/
float
(
dictFreq
)
;
sqSum
+=
weight
*
powf
(
diffFreq
,
2.0
f
)
;
mxSum
+=
weight
;
}
}
similarity
.
m_similarity
=
sqrtf
(
mxSum
)
-
sqrtf
(
sqSum
);
if
(
mxSum
>
0.0
f
)
similarity
.
m_similarity
=
1.0
f
-
sqrtf
(
sqSum
/
mxSum
);
else
similarity
.
m_similarity
=
0.0
f
;
similarity
.
m_seconds
=
utils
::
Toc
(
tic
).
seconds
();
}
...
...
source/utue/pe/tools/lsh/example/CMakeLists.txt
View file @
c4068fce
...
...
@@ -42,6 +42,8 @@
# email: david.geisler@uni-tuebingen.de #
############################################################################
#add_library(pe-tools-lsh STATIC Document.cpp;Token.cpp;NGram.cpp;Word.cpp)
add_executable
(
pe-tools-lsh-example-minhash MinHash.cpp
)
target_link_libraries
(
pe-tools-lsh-example-minhash pe-tools-lsh;pe-utils
)
\ No newline at end of file
target_link_libraries
(
pe-tools-lsh-example-minhash pe-tools-lsh;pe-utils
)
add_executable
(
pe-tools-lsh-example-subsmatch SubsMatch.cpp
)
target_link_libraries
(
pe-tools-lsh-example-subsmatch pe-tools-lsh;pe-utils
)
\ No newline at end of file
source/utue/pe/tools/lsh/example/MinHash.cpp
View file @
c4068fce
...
...
@@ -83,6 +83,10 @@ int main(int argc, const char** argv) {
// two complete different string
//doc[0] = std::string("t3E4 VurB m6qu VsTP");
//doc[1] = std::string("8PZW rADb g5oG X4rp");
// two exact same strings
//doc[0] = std::string("hello world!");
//doc[1] = std::string("hello world!");
break
;
case
3
:
// two arguments given
doc
[
0
]
=
std
::
string
(
argv
[
1
]);
...
...
source/utue/pe/tools/lsh/example/SubsMatch.cpp
0 → 100644
View file @
c4068fce
/****************************************************************************
* Copyright (C) 2019 by David Geisler <david.geisler@uni-tuebingen.de> *
* *
* This file is part of the Perception Engineering Toolbox 1.0. *
* Redistribution and use in source and binary forms, with or without *
* modification, are permitted provided that the following conditions *
* are met: *
* *
* 1. Redistributions of source code must retain the above copyright *
* notice, this list of conditions and the following disclaimer. *
* 2. Redistributions in binary form must reproduce the above copyright *
* notice, this list of conditions and the following disclaimer in the *
* documentation and/or other materials provided with the distribution. *
* 3. Neither the name of the copyright holder nor the names of its *
* contributors may be used to endorse or promote products derived from *
* this software without specific prior written permission. *
* *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT *
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A *
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT *
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, *
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED *
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
* *
* Correspondence should be directed to *
* Eberhard Karls Universität Tübingen: *
* *
* Eberhard Karls Universität Tübingen *
* Mathematisch-Naturwissenschaftliche Fakultät *
* Technische Informatik - Perception Engineering *
* David Geisler *
* Sand 14 *
* D-72076 Tübingen *
* GERMANY *
* www.uni-tuebingen.de/en *
* *
* email: david.geisler@uni-tuebingen.de *
****************************************************************************/
/**
* @author David Geisler
* @copyright Copyright (C) 2019 by David Geisler <david.geisler@uni-tuebingen.de>
* @date 22 Oct 2019
*
* @package utue.pe.tools.lsh.example
*
* @brief Example program that calculates the SubsMatch similarity of two strings.
*/
#include <utue/pe/tools/lsh/SubsMatch.h>
#include <utue/pe/utils/TicToc.h>
#include <iostream>
#include <iomanip>
using
namespace
utue
::
pe
::
tools
::
lsh
;
using
namespace
utue
::
pe
::
utils
;
/**
 * Example entry point: calculates the SubsMatch similarity of two strings.
 *
 * Called without arguments, two built-in example sentences are compared.
 * Called with exactly two arguments, those two strings are compared.
 * Any other number of arguments prints a usage message to stderr.
 *
 * @param argc Number of command line arguments (including the program name)
 * @param argv Command line arguments; argv[1] and argv[2] are the strings to compare
 * @return EXIT_SUCCESS after the comparison, EXIT_FAILURE on an invalid argument count
 */
int main(int argc, const char** argv) {

    SubsMatchPtr subsMatch;
    SubsMatch::Similarity similarity;
    std::string doc[2];
    Tic tic;
    Toc toc;

    switch(argc) {
        case 1:
            // no arguments given -> use example sentences
            doc[0] = std::string("The quick brown fox jumps over the lazy dog.");
            doc[1] = std::string("The slow black cow jumps over the lazy monkey.");

            // permutated string
            //doc[0] = std::string("t3E4 VurB m6qu VsTP 8PZW rADb g5oG X4rp l76L");
            //doc[1] = std::string("8PZW rADb g5oG X4rp t3E4 VurB m6qu VsTP vVOq");

            // two complete different string
            //doc[0] = std::string("t3E4 VurB m6qu VsTP");
            //doc[1] = std::string("8PZW rADb g5oG X4rp");

            // two exact same strings
            //doc[0] = std::string("hello world!");
            //doc[1] = std::string("hello world!");
            break;
        case 3:
            // two arguments given
            doc[0] = std::string(argv[1]);
            doc[1] = std::string(argv[2]);
            break;
        default:
            // we expect either no arguments or two arguments
            std::cerr << "Invalid number of arguments. Call the program without arguments to calculate the "
                      << "similarity of two example strings. Or call the program with two strings as argument "
                      << "to calculate their similarity:" << std::endl
                      << " " << argv[0] << " <string0> <string1>" << std::endl;
            // return instead of exit() so that the local objects are destroyed properly
            return EXIT_FAILURE;
    }

    // create a new SubsMatch instance
    subsMatch = SubsMatch::create();

    // add scanners to tokenize the strings into ngrams
    subsMatch->createScanner<Token::NGRAM>(1.0f, 3, 0); // extracts 3-grams and weight them by 1.0
    subsMatch->createScanner<Token::NGRAM>(1.0f, 4, 1); // 4-grams but with one leak
    subsMatch->createScanner<Token::NGRAM>(1.0f, 5, 2); // 5-grams but with two leaks

    subsMatch->createScanner<Token::NGRAM>(1.5f, 4, 0); // extracts 4-grams and weight them by 1.5
    subsMatch->createScanner<Token::NGRAM>(1.5f, 5, 1); // 5-grams but with one leak
    subsMatch->createScanner<Token::NGRAM>(1.5f, 6, 2); // 6-grams but with two leaks

    // additionally scan for word tokens, weighted by 2.0
    subsMatch->createScanner<Token::WORD>(2.0f);

    // print what happen
    std::cout << "Calculate similarity:" << std::endl;
    std::cout << " string0: \"" << doc[0] << "\"" << std::endl;
    std::cout << " string1: \"" << doc[1] << "\"" << std::endl;

    tic = Tic(); // initialize time measurement

    // scan the text and calculate the similarity of the extracted token frequencies
    similarity = subsMatch->similarity(doc[0], doc[1]);

    toc = Toc(tic); // stop time measurement

    // print the number of extracted tokens
    std::cout << " dictionary: " << subsMatch->dictionary()->count() << std::endl;

    // print the over all runtime (including string scanning)
    std::cout << " seconds: " << toc.seconds() << std::endl;

    // print the similarity report; according to the original comment this lists
    // the top 20 tokens with the highest impact to the scoring
    std::cout << std::endl << similarity.toString() << std::endl;

    return EXIT_SUCCESS;
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment