Commit 62a7ddc3 authored by David Geisler's avatar David Geisler

subsmatch added

parent 9e3c3b5f
......@@ -158,7 +158,7 @@ namespace utue::pe::tools::lsh {
/**
* Number of hashes/permutations
*/
unsigned int m_hashes{};
unsigned int m_hashes;
/**
* Number of buckets per hash table
......
/****************************************************************************
* Copyright (C) 2019 by David Geisler <david.geisler@uni-tuebingen.de> *
* *
* This file is part of the Perception Engineering Toolbox 1.0. *
* Redistribution and use in source and binary forms, with or without *
* modification, are permitted provided that the following conditions *
* are met: *
* *
* 1. Redistributions of source code must retain the above copyright *
* notice, this list of conditions and the following disclaimer. *
* 2. Redistributions in binary form must reproduce the above copyright *
* notice, this list of conditions and the following disclaimer in the *
* documentation and/or other materials provided with the distribution. *
* 3. Neither the name of the copyright holder nor the names of its *
* contributors may be used to endorse or promote products derived from *
* this software without specific prior written permission. *
* *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT *
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A *
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT *
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, *
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED *
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
* *
* Correspondence should be directed to *
* Eberhard Karls Universität Tübingen: *
* *
* Eberhard Karls Universität Tübingen *
* Mathematisch-Naturwissenschaftliche Fakultät *
* Technische Informatik - Perception Engineering *
* David Geisler *
* Sand 14 *
* D-72076 Tübingen *
* GERMANY *
* www.uni-tuebingen.de/en *
* *
* email: david.geisler@uni-tuebingen.de *
****************************************************************************/
/**
* @author David Geisler
* @copyright Copyright (C) 2019 by David Geisler <david.geisler@uni-tuebingen.de>
* @date 22 Oct 2019
*
* @package utue.pe.tools.lsh
*
* @brief Defines an object that implements a SubsMatch based text similarity.
*/
#ifndef UTUE_PE_TOOLS_LSH_SUBSMATCH_H
#define UTUE_PE_TOOLS_LSH_SUBSMATCH_H
#include <utue/pe/tools/lsh/Document.h>
#include <utue/pe/utils/HashSet.h>
#include <memory>
namespace utue::pe::tools::lsh {
/**
* @brief Implements a SubsMatch algorithm to estimate the similarity of multiple Documents
*
* All Documents created by one SubsMatch object share its Dictionary and its Scanner.
*/
class SubsMatch {
public:
/**
* @brief Stores the result of one SubsMatch similarity calculation
*/
class Similarity {
public:
/**
* The left Document
*/
DocumentPtr m_left;
/**
* The right Document
*/
DocumentPtr m_right;
/**
* Similarity score
*/
float m_similarity;
/**
* Processing time in seconds
*/
float m_seconds;
/**
* Creates a new Similarity object
* @param left Left Document
* @param right Right Document
*/
Similarity(DocumentPtr left, DocumentPtr right);
/**
* Creates an empty Similarity object
*/
Similarity();
/**
* Creates a human-readable report of this result
* @return Multi-line string containing the runtime and the similarity score
*/
[[nodiscard]] std::string toString() const;
};
private:
/**
* Dictionary of all related Documents
* @see Dictionary
*/
DictionaryPtr m_dictionary;
/**
* Scanner to extract Tokens from text
* @see Scanner
*/
ScannerPtr m_scanner;
public:
/**
* Creates an empty SubsMatch object
*/
explicit SubsMatch();
/**
* Creates a new document
* @return Empty document linked to the Dictionary and Scanner of this SubsMatch
*/
[[nodiscard]] DocumentPtr createDocument();
/**
* Creates a new document
* @param text content of the document
* @return Document with initialized content linked to the Dictionary and Scanner of this SubsMatch
*/
[[nodiscard]] DocumentPtr createDocument(const std::string& text);
/**
* Creates a new token scanner and adds it to this SubsMatch
* @tparam TokenType Type of the scanner (eg NGRAM)
* @tparam args Type of the scanner arguments
* @param values scanner arguments
* @return token scanner
*/
template<Token::Type TokenType, typename... args>
ScannerPtr createScanner(args... values) {
ScannerPtr scanner;
scanner = Scanner::create<TokenType>(values...);
this->add(scanner);
return scanner;
}
/**
* Calculates the similarity of two Documents
* @param similarity Read/Write container with two documents; receives the similarity score and the processing time
*/
void similarity(Similarity& similarity);
/**
* Calculates the similarity of two Documents
* @param left The first Document
* @param right The second Document
* @return Similarity of two Documents
*/
[[nodiscard]] Similarity similarity(const DocumentPtr& left, const DocumentPtr& right);
/**
* Calculates the similarity of two strings
* @param left The first string
* @param right The second string
* @return Similarity of two strings
*/
[[nodiscard]] Similarity similarity(const std::string& left, const std::string& right);
/**
* Add a Scanner to the SubsMatch
* @param scanner Scanner to add
*/
void add(const ScannerPtr& scanner);
/**
* Returns the dictionary
* @return Dictionary
*/
[[nodiscard]] const DictionaryPtr& dictionary() const;
/**
* Creates a SubsMatch object
* @return Pointer to the created SubsMatch object
*/
[[nodiscard]] static std::shared_ptr<SubsMatch> create();
};
/**
 * Shared pointer to a SubsMatch object
 */
using SubsMatchPtr = std::shared_ptr<SubsMatch>;
}
#endif //UTUE_PE_TOOLS_LSH_SUBSMATCH_H
......@@ -43,6 +43,6 @@
############################################################################
#add_library(pe-tools-lsh STATIC Document.cpp;Token.cpp;NGram.cpp;Word.cpp)
add_library(pe-tools-lsh STATIC Token.cpp;Scanner.cpp;Dictionary.cpp;Document.cpp;MinHash.cpp;NGram.cpp;Word.cpp)
add_library(pe-tools-lsh STATIC Token.cpp;Scanner.cpp;Dictionary.cpp;Document.cpp;MinHash.cpp;NGram.cpp;Word.cpp;SubsMatch.cpp)
ADDSUBDIRS()
\ No newline at end of file
/****************************************************************************
* Copyright (C) 2019 by David Geisler <david.geisler@uni-tuebingen.de> *
* *
* This file is part of the Perception Engineering Toolbox 1.0. *
* Redistribution and use in source and binary forms, with or without *
* modification, are permitted provided that the following conditions *
* are met: *
* *
* 1. Redistributions of source code must retain the above copyright *
* notice, this list of conditions and the following disclaimer. *
* 2. Redistributions in binary form must reproduce the above copyright *
* notice, this list of conditions and the following disclaimer in the *
* documentation and/or other materials provided with the distribution. *
* 3. Neither the name of the copyright holder nor the names of its *
* contributors may be used to endorse or promote products derived from *
* this software without specific prior written permission. *
* *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT *
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A *
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT *
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, *
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED *
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
* *
* Correspondence should be directed to *
* Eberhard Karls Universität Tübingen: *
* *
* Eberhard Karls Universität Tübingen *
* Mathematisch-Naturwissenschaftliche Fakultät *
* Technische Informatik - Perception Engineering *
* David Geisler *
* Sand 14 *
* D-72076 Tübingen *
* GERMANY *
* www.uni-tuebingen.de/en *
* *
* email: david.geisler@uni-tuebingen.de *
****************************************************************************/
/**
* @author David Geisler
* @copyright Copyright (C) 2019 by David Geisler <david.geisler@uni-tuebingen.de>
* @date 22 Oct 2019
*
* @package utue.pe.tools.lsh
*
* @brief Implements the SubsMatch based text similarity declared in SubsMatch.h.
*/
#include <utue/pe/tools/lsh/SubsMatch.h>
#include <utue/pe/tools/lsh/Scanner.h>
#include <utue/pe/utils/Random.h>
#include <memory>
#include <cmath>
#include <utility>
#include <utue/pe/utils/TicToc.h>
#include <sstream>
#include <iomanip>
namespace utue::pe::tools::lsh {
SubsMatch::Similarity::Similarity(DocumentPtr left, DocumentPtr right) :
m_left(std::move(left)),
m_right(std::move(right)),
m_similarity(0.0f),
m_seconds(0.0f) { }
SubsMatch::Similarity::Similarity() :
m_left(),
m_right(),
m_similarity(0.0f),
m_seconds(0.0f) { }
/**
 * Builds a short human-readable report of this result.
 * @return Multi-line string with a header line, the runtime in seconds
 *         and the similarity score
 */
std::string SubsMatch::Similarity::toString() const {
    std::stringstream ss;

    // '\n' instead of std::endl: the text is identical, but an in-memory
    // stream does not need to be flushed after every line.
    ss << " --------------- R E P O R T ---------------" << '\n';
    ss << " Runtime: " << this->m_seconds << " seconds" << '\n';
    ss << " Similarity: " << this->m_similarity << '\n';

    return ss.str();
}
class SubsMatchScanner : public Scanner {
private:
std::vector<ScannerPtr> m_scanners;
public:
SubsMatchScanner() = default;
void scan(const std::string& text, std::function<void(const Token&)> callback) override {
for(ScannerPtr& scanner : this->m_scanners)
scanner->scan(text,callback);
}
void add(const ScannerPtr& scanner) {
this->m_scanners.emplace_back(scanner);
}
};
SubsMatch::SubsMatch() :
m_dictionary(new Dictionary()),
m_scanner(new SubsMatchScanner()) {
}
/**
 * Creates a new Document that uses the Dictionary and the Scanner of this SubsMatch.
 * @return Newly created Document
 */
DocumentPtr SubsMatch::createDocument() {
    DocumentPtr document = std::make_shared<Document>(m_dictionary, m_scanner);
    return document;
}
/**
 * Creates a new Document and scans the given text into it.
 * @param text Content of the Document
 * @return Document after scanning the text
 */
DocumentPtr SubsMatch::createDocument(const std::string& text) {
    DocumentPtr document = this->createDocument();
    document->scan(text);
    return document;
}
/**
 * Calculates the similarity of the two Documents stored in the container.
 *
 * For every Token of the Dictionary that occurs in at least one of the two
 * Documents, the weighted squared difference of both frequencies is
 * accumulated (sqSum), together with a weighted upper bound of that
 * difference (mxSum). The score is sqrt(mxSum) - sqrt(sqSum).
 *
 * @param similarity Read/Write container; m_left and m_right are read (both
 *        are dereferenced and must not be null), m_similarity and m_seconds
 *        are written
 */
void SubsMatch::similarity(Similarity& similarity) {
    utils::Tic tic;

    float sqSum = 0.0f;
    float mxSum = 0.0f;

    for(const auto& bucket : this->m_dictionary->tokens().fastIterator()) {
        for(const auto& token : *bucket) {
            const float leftFreq = similarity.m_left->frequency(token.first);
            const float rightFreq = similarity.m_right->frequency(token.first);

            // Tokens that occur in neither Document do not contribute.
            if(leftFreq == 0.0f && rightFreq == 0.0f)
                continue;

            const float weight = token.first.weight();
            const float diffFreq = leftFreq - rightFreq;
            sqSum += weight*diffFreq*diffFreq;

            // NOTE(review): the previous statement was `mxSum += mxSum;`, which
            // keeps mxSum at 0 forever and made the score always -sqrt(sqSum).
            // The larger of both frequencies is used as the per-token bound:
            // assuming frequencies are non-negative, |left-right| <= max(left,right),
            // so the score cannot become negative and is 0 when the Documents
            // share no Token. Confirm this is the intended normalization.
            const float maxFreq = leftFreq > rightFreq ? leftFreq : rightFreq;
            mxSum += weight*maxFreq*maxFreq;
        }
    }

    similarity.m_similarity = std::sqrt(mxSum) - std::sqrt(sqSum);
    similarity.m_seconds = utils::Toc(tic).seconds();
}
/**
 * Calculates the similarity of two Documents.
 * @param left The first Document
 * @param right The second Document
 * @return Container holding both Documents, the score and the processing time
 */
SubsMatch::Similarity SubsMatch::similarity(const DocumentPtr& left, const DocumentPtr& right) {
    Similarity result(left, right);
    this->similarity(result);
    return result;
}
/**
 * Calculates the similarity of two strings.
 * Each string is scanned into its own new Document first.
 * @param left The first string
 * @param right The second string
 * @return Container holding both created Documents, the score and the processing time
 */
SubsMatch::Similarity SubsMatch::similarity(const std::string& left, const std::string& right) {
    // Create the Documents in separate statements: as function arguments their
    // evaluation order is unspecified, so the order in which both texts are
    // scanned would depend on the compiler.
    const DocumentPtr leftDocument = this->createDocument(left);
    const DocumentPtr rightDocument = this->createDocument(right);
    return this->similarity(leftDocument, rightDocument);
}
void SubsMatch::add(const ScannerPtr& scanner) {
((SubsMatchScanner*)this->m_scanner.get())->add(scanner);
}
/**
 * Returns the Dictionary of this SubsMatch.
 * @return Reference to the Dictionary pointer held by this object
 */
const DictionaryPtr& SubsMatch::dictionary() const {
    return m_dictionary;
}
std::shared_ptr<SubsMatch> SubsMatch::create() {
return std::make_shared<SubsMatch>();
}
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment