Commit c4068fce authored by David Geisler

fix subsmatch score scaling. add example for subsmatch

parent 62a7ddc3
......@@ -126,6 +126,13 @@ namespace utue::pe::tools::lsh {
* @return relative frequency
*/
[[nodiscard]] float frequency(const Token& token) const;
/**
* Returns the number of occurrences of a token in the documents
* @param token Requested token
* @return number of occurrences
*/
[[nodiscard]] unsigned int count(const Token& token) const;
};
/**
......
......@@ -86,4 +86,8 @@ namespace utue::pe::tools::lsh {
float Dictionary::frequency(const Token& token) const {
return float(this->m_tokens(token,0))/float(this->m_count);
}
unsigned int Dictionary::count(const Token& token) const {
return this->m_tokens(token,0);
}
}
\ No newline at end of file
......@@ -124,8 +124,9 @@ namespace utue::pe::tools::lsh {
void SubsMatch::similarity(Similarity& similarity) {
utils::Tic tic;
float leftFreq;
float rightFreq;
unsigned int leftFreq;
unsigned int rightFreq;
unsigned int dictFreq;
float diffFreq;
float weight;
float sqSum;
......@@ -136,18 +137,22 @@ namespace utue::pe::tools::lsh {
for(const auto& bucket : this->m_dictionary->tokens().fastIterator()) {
for(const std::pair<Token,unsigned int>& token : *bucket) {
leftFreq = similarity.m_left->frequency(token.first);
rightFreq = similarity.m_right->frequency(token.first);
if(leftFreq == 0.0f && rightFreq == 0.0f)
leftFreq = similarity.m_left->count(token.first);
rightFreq = similarity.m_right->count(token.first);
if(leftFreq == 0 && rightFreq == 0)
continue;
dictFreq = this->m_dictionary->count(token.first);
weight = token.first.weight();
diffFreq = leftFreq-rightFreq;
sqSum += weight*diffFreq*diffFreq;
mxSum += mxSum;
diffFreq = (float(leftFreq)-float(rightFreq))/float(dictFreq);
sqSum += weight*powf(diffFreq,2.0f);
mxSum += weight;
}
}
similarity.m_similarity = sqrtf(mxSum)-sqrtf(sqSum);
if(mxSum > 0.0f)
similarity.m_similarity = 1.0f-sqrtf(sqSum/mxSum);
else
similarity.m_similarity = 0.0f;
similarity.m_seconds = utils::Toc(tic).seconds();
}
......
......@@ -42,6 +42,8 @@
# email: david.geisler@uni-tuebingen.de #
############################################################################
#add_library(pe-tools-lsh STATIC Document.cpp;Token.cpp;NGram.cpp;Word.cpp)
add_executable(pe-tools-lsh-example-minhash MinHash.cpp)
target_link_libraries(pe-tools-lsh-example-minhash pe-tools-lsh;pe-utils)
\ No newline at end of file
target_link_libraries(pe-tools-lsh-example-minhash pe-tools-lsh;pe-utils)
add_executable(pe-tools-lsh-example-subsmatch SubsMatch.cpp)
target_link_libraries(pe-tools-lsh-example-subsmatch pe-tools-lsh;pe-utils)
\ No newline at end of file
......@@ -83,6 +83,10 @@ int main(int argc, const char** argv) {
// two complete different string
//doc[0] = std::string("t3E4 VurB m6qu VsTP");
//doc[1] = std::string("8PZW rADb g5oG X4rp");
// two exact same strings
//doc[0] = std::string("hello world!");
//doc[1] = std::string("hello world!");
break;
case 3: // two arguments given
doc[0] = std::string(argv[1]);
......
/****************************************************************************
* Copyright (C) 2019 by David Geisler <david.geisler@uni-tuebingen.de> *
* *
* This file is part of the Perception Engineering Toolbox 1.0. *
* Redistribution and use in source and binary forms, with or without *
* modification, are permitted provided that the following conditions *
* are met: *
* *
* 1. Redistributions of source code must retain the above copyright *
* notice, this list of conditions and the following disclaimer. *
* 2. Redistributions in binary form must reproduce the above copyright *
* notice, this list of conditions and the following disclaimer in the *
* documentation and/or other materials provided with the distribution. *
* 3. Neither the name of the copyright holder nor the names of its *
* contributors may be used to endorse or promote products derived from *
* this software without specific prior written permission. *
* *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT *
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A *
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT *
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, *
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED *
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
* *
* Correspondence should be directed to *
* Eberhard Karls Universität Tübingen: *
* *
* Eberhard Karls Universität Tübingen *
* Mathematisch-Naturwissenschaftliche Fakultät *
* Technische Informatik - Perception Engineering *
* David Geisler *
* Sand 14 *
* D-72076 Tübingen *
* GERMANY *
* www.uni-tuebingen.de/en *
* *
* email: david.geisler@uni-tuebingen.de *
****************************************************************************/
/**
* @author David Geisler
* @copyright Copyright (C) 2019 by David Geisler <david.geisler@uni-tuebingen.de>
* @date 22 Oct 2019
*
* @package utue.pe.tools.lsh.example
*
* @brief Example program that computes the SubsMatch similarity of two strings.
*/
#include <utue/pe/tools/lsh/SubsMatch.h>
#include <utue/pe/utils/TicToc.h>
#include <iostream>
#include <iomanip>
using namespace utue::pe::tools::lsh;
using namespace utue::pe::utils;
int main(int argc, const char** argv) {
SubsMatchPtr subsMatch;
SubsMatch::Similarity similarity;
std::string doc[2];
Tic tic;
Toc toc;
switch(argc) {
case 1: // no arguments given -> use example sentences
//doc[0] = std::string("The quick brown fox jumps over the lazy dog.");
//doc[1] = std::string("The slow black cow jumps over the lazy monkey.");
// permutated string
//doc[0] = std::string("t3E4 VurB m6qu VsTP 8PZW rADb g5oG X4rp l76L");
//doc[1] = std::string("8PZW rADb g5oG X4rp t3E4 VurB m6qu VsTP vVOq");
// two complete different string
//doc[0] = std::string("t3E4 VurB m6qu VsTP");
//doc[1] = std::string("8PZW rADb g5oG X4rp");
// two exact same strings
//doc[0] = std::string("hello world!");
//doc[1] = std::string("hello world!");
break;
case 3: // two arguments given
doc[0] = std::string(argv[1]);
doc[1] = std::string(argv[2]);
break;
default: // we expect either no arguments or two arguments
std::cerr
<< "Invalid number of arguments. Call the program without arguments to calculate the"
<< "similarity of two example strings. Or call the program with two strings as argument "
<< "to compare calculate the similarity of them:" << std::endl
<< " " << argv[0] << " <string0> <string1>" << std::endl;
exit(EXIT_FAILURE); // fail
}
// create a new SubsMatch instance
subsMatch = SubsMatch::create();
// add scanners to tokenize the strings into ngrams
subsMatch->createScanner<Token::NGRAM>(1.0f,3,0); // extracts 3-grams and weight them by 1.0
subsMatch->createScanner<Token::NGRAM>(1.0f,4,1); // 4-grams but with one leak
subsMatch->createScanner<Token::NGRAM>(1.0f,5,2); // 5-grams but with two leaks
subsMatch->createScanner<Token::NGRAM>(1.5f,4,0); // extracts 4-grams and weight them by 2.0
subsMatch->createScanner<Token::NGRAM>(1.5f,5,1); // 5-grams but with one leak
subsMatch->createScanner<Token::NGRAM>(1.5f,6,2); // 6-grams but with two leaks
//
subsMatch->createScanner<Token::WORD>(2.0f);
// print what happen
std::cout << "Calculate similarity:" << std::endl;
std::cout << " string0: \"" << doc[0] << "\"" << std::endl;
std::cout << " string1: \"" << doc[1] << "\"" << std::endl;
tic = Tic(); // initialize time measurement
// scan the text and calculate the similarity of the extracted token frequencies
similarity = subsMatch->similarity(doc[0],doc[1]);
toc = Toc(tic); // stop time measurement
// print the number of extracted tokens
std::cout << " dictionary: " << subsMatch->dictionary()->count() << std::endl;
// print the over all runtime (including string scanning)
std::cout << " seconds: " << toc.seconds() << std::endl;
// print the top 20 tokens with the highest impact to the scoring
std::cout << std::endl << similarity.toString() << std::endl;
return EXIT_SUCCESS;
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment