Commit b569a6a4 authored by David Geisler

Provide different iterators for full bucket iteration (slow) and active bucket iteration (fast)
parent 7a044396
......@@ -68,9 +68,70 @@ namespace utue::pe::utils {
class HashSet {
public:
/**
* Bucket iterator type
* Read-only iterator over a list of pointers to buckets
*/
typedef typename std::list<std::list<T>*>::const_iterator BucketIterator;
/**
* Fast bucket iterator type
* Read/write iterator over a list of pointers to buckets
*/
typedef typename std::list<std::list<T>*>::iterator FastBucketIterator;
/**
* Slow bucket iterator type
* Read/write iterator over a vector of buckets
*/
typedef typename std::vector<std::list<T>>::iterator SlowBucketIterator;
/**
* Fast const bucket iterator type
* Read-only counterpart of FastBucketIterator
*/
typedef typename std::list<std::list<T>*>::const_iterator FastConstBucketIterator;
/**
* Slow const bucket iterator type
* Read-only counterpart of SlowBucketIterator
*/
typedef typename std::vector<std::list<T>>::const_iterator SlowConstBucketIterator;
/**
 * Bucket iterator container.
 * Bundles a begin/end iterator pair so that the denoted bucket sequence can be
 * traversed with a range-based for loop.
 * @tparam IteratorType Type of the wrapped iterator (e.g. FastBucketIterator,
 *         SlowBucketIterator or one of their const counterparts)
 */
template<typename IteratorType>
class Iterator {
private:
    /**
     * Iterator that points to the first bucket
     */
    IteratorType m_begin;
    /**
     * Iterator that points one past the last bucket
     */
    IteratorType m_end;
public:
    /**
     * Create a new bucket iterator container
     * @param begin Iterator that points to the first bucket
     * @param end Iterator that points one past the last bucket
     */
    Iterator(const IteratorType& begin, const IteratorType& end) : m_begin(begin), m_end(end) {
    }

    /**
     * Returns a copy of the iterator that points to the first bucket.
     * The accessor is const-qualified because it only copies the stored
     * iterator; this allows iterating over a const container object as well.
     * @return Iterator that points to the first bucket.
     */
    IteratorType begin() const {
        return this->m_begin;
    }

    /**
     * Returns a copy of the iterator that points one past the last bucket.
     * The accessor is const-qualified because it only copies the stored
     * iterator; this allows iterating over a const container object as well.
     * @return Iterator that points one past the last bucket.
     */
    IteratorType end() const {
        return this->m_end;
    }
};
protected:
/**
......@@ -132,22 +193,40 @@ namespace utue::pe::utils {
}
/**
* Returns a read/write iterator that points to the first
* bucket. Iteration is done in ordinary element order.
* @return Iterator that points to the first bucket.
* Returns an iterator container for fast bucket iteration.
* Only active buckets will be considered.
* @return Iterator container
*/
/**
 * Returns a read-only iterator positioned at the first entry of the
 * active bucket list.
 * @return Iterator that points to the first active bucket.
 */
BucketIterator begin() const {
    const auto& activeBuckets = this->m_active_buckets;
    return activeBuckets.begin();
}
/**
 * Builds an iterator container for fast bucket iteration.
 * The container spans the active bucket list only.
 * @return Iterator container over the active buckets
 */
Iterator<FastBucketIterator> fastIterator() {
    auto& activeBuckets = this->m_active_buckets;
    return Iterator<FastBucketIterator>(activeBuckets.begin(), activeBuckets.end());
}
/**
* Returns a read/write iterator that points one past the last
* bucket. Iteration is done in ordinary element order.
* @return Iterator that points one past to the last bucket.
* Returns an iterator container for slow bucket iteration.
* All buckets will be considered, if active or not.
* @return Iterator container
*/
/**
 * Returns a read-only iterator positioned one past the last entry of the
 * active bucket list.
 * @return Iterator that points one past the last active bucket.
 */
BucketIterator end() const {
    const auto& activeBuckets = this->m_active_buckets;
    return activeBuckets.end();
}
/**
 * Builds an iterator container for slow bucket iteration.
 * The container spans the complete bucket storage, whether a bucket is
 * active or not.
 * @return Iterator container over all buckets
 */
Iterator<SlowBucketIterator> slowIterator() {
    auto& allBuckets = this->m_buckets;
    return Iterator<SlowBucketIterator>(allBuckets.begin(), allBuckets.end());
}
/**
 * Builds a read-only iterator container for fast bucket iteration.
 * The container spans the active bucket list only.
 * @return Iterator container over the active buckets
 */
Iterator<FastConstBucketIterator> fastIterator() const {
    const auto& activeBuckets = this->m_active_buckets;
    return Iterator<FastConstBucketIterator>(activeBuckets.begin(), activeBuckets.end());
}
/**
 * Builds a read-only iterator container for slow bucket iteration.
 * The container spans the complete bucket storage, whether a bucket is
 * active or not.
 * @return Iterator container over all buckets
 */
Iterator<SlowConstBucketIterator> slowIterator() const {
    const auto& allBuckets = this->m_buckets;
    return Iterator<SlowConstBucketIterator>(allBuckets.begin(), allBuckets.end());
}
protected:
......
......@@ -133,9 +133,12 @@ namespace utue::pe::tools::lsh {
for(const auto& permutation : this->m_permutations) {
// skip if permutation list is empty
for(const auto& bucket : permutation) {
for(const auto& bucket : permutation.slowIterator()) { //Its important to use the slow iterator here!
//The fast iterator nearly reflects the insertion
//order of the Tokens. The permutation would therefore
//be weak, and earlier Tokens would be weighted higher.
// iterate over tokens in permutation list
for (const auto& token : *bucket) {
for (const auto& token : bucket) {
//todo: in worst case both documents are
// disjoint and we run through all
// tokens
......@@ -152,8 +155,12 @@ namespace utue::pe::tools::lsh {
// word appears at least in one of both documents
if(leftFreq > 0.0f && rightFreq > 0.0f)
// Token appears in both documents
// A low overall dictionary frequency suggests that the Token is more expressive
// for those documents. It is therefore weighted higher by dividing by the low frequency.
weight = token.weight()/dictFreq;
else
// Token appears only in one of both documents
weight = token.weight()*dictFreq;
for(unsigned int j = 0; j < thresholds; j++) {
......@@ -174,6 +181,7 @@ namespace utue::pe::tools::lsh {
if(count > 0.0f)
return score/count;
return 0.0f;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment