C++ で Suffix Array

真の STL 使いは std::string とか書かずに全部 iterator でなんとかする、気がする。いや、そんなことはないか。
自分なら、比較用の functor はこう書く。

template<typename CharIterator, typename Index>
struct suffix_less : public std::binary_function<Index, Index, bool>
{
public:
    suffix_less(CharIterator first, CharIterator last) : first_(first), last_(last) {
    }

    bool operator()(Index x, Index y) const {
        return std::lexicographical_compare(first_ + x, last_, first_ + y, last_);
    }
private:
    CharIterator first_, last_;
};

std::lexicographical_compare って案外マイナーかもしれない。

  1. std::equal_range()をより広い範囲(typeid(*iter) != typeid(value))にも適用できるように拡張したオリジナルequal_range()を作る.
  2. compをオリジナルの関数オブジェクトにして,メンバに検索語を持たせる.vlaueが-1の時は検索語で,value >= 0の時は[doc.begin() + sa[value], doc.end())というsuffixだと認識するようにする.

Boost には transform_iterator というのがあるので、それを使うという手もあり。
で、まぁ、すべてが iterator、な感じでアルゴリズムを実装するとこうなるという例。

#include <iostream>
#include <functional>
#include <algorithm>
#include <iterator>
#include <vector>
#include <cstdlib>

#include <boost/iterator/transform_iterator.hpp>

template<typename Iterator, typename T>
void iota(Iterator first, Iterator last, T value)
{
    while (first != last) {
        *first++ = value++;
    }
}

template<typename CharIterator, typename Index>
struct suffix_less : public std::binary_function<Index, Index, bool>
{
public:
    suffix_less(CharIterator first, CharIterator last) : first_(first), last_(last) {
    }

    bool operator()(Index x, Index y) const {
        return std::lexicographical_compare(first_ + x, last_, first_ + y, last_);
    }
private:
    CharIterator first_, last_;
};

template<typename CharIterator, typename Index>
struct index_to_substring : public std::unary_function<Index, std::pair<CharIterator, CharIterator> >
{
    index_to_substring() { }

    index_to_substring(CharIterator first, CharIterator last, Index length) 
        : first_(first), last_(last), length_(length) {
    }

    std::pair<CharIterator, CharIterator> operator()(Index i) const {
        if (first_ + i + length_ <= last_) {
            return std::make_pair(first_ + i, first_ + i + length_);
        } else {
            return std::make_pair(first_ + i, last_);
        }
    }
private:
    CharIterator first_, last_;
    Index length_;
};

    
template<typename CharIterator, typename SAIterator>
void suffix_sort(CharIterator textFirst, CharIterator textLast,
                 SAIterator saFirst, SAIterator saLast)
{
    typedef typename std::iterator_traits<SAIterator>::value_type Index;
    suffix_less<CharIterator, Index> cmp(textFirst, textLast);

    std::sort(saFirst, saLast, cmp);
}

template<typename Iterator>
struct pair_lexicographical_compare 
    : public std::binary_function<std::pair<Iterator, Iterator>,
                                  std::pair<Iterator, Iterator>,
                                  bool>
{
    bool operator()(const std::pair<Iterator, Iterator> a,
                    const std::pair<Iterator, Iterator> b) const {
        return std::lexicographical_compare(a.first, a.second,
                                            b.first, b.second);
    }
};

template<typename Iterator, typename T, typename Comparator>
std::pair<typename std::iterator_traits<Iterator>::difference_type,
          typename std::iterator_traits<Iterator>::difference_type>
equal_range_index(Iterator first, Iterator last, T val, Comparator cmp)
{
    std::pair<Iterator, Iterator> range = std::equal_range(first, last, val, cmp);
    return std::make_pair(std::distance(first, range.first),
                          std::distance(first, range.second));
}

template<typename CharIterator, typename IndexIterator>
std::pair<typename std::iterator_traits<IndexIterator>::value_type,
          typename std::iterator_traits<IndexIterator>::value_type >
suffix_search(CharIterator textFirst, CharIterator textLast,
              IndexIterator saFirst, IndexIterator saLast,
              CharIterator wordFirst, CharIterator wordLast)
{
    typedef typename std::iterator_traits<IndexIterator>::value_type Index;
    typedef typename std::iterator_traits<CharIterator>::difference_type size_type;
    
    size_type n = std::distance(wordFirst, wordLast);
    index_to_substring<CharIterator, Index> func(textFirst, textLast, n);

    return equal_range_index(boost::make_transform_iterator(saFirst, func),
                             boost::make_transform_iterator(saLast, func),
                             std::make_pair(wordFirst, wordLast),
                             pair_lexicographical_compare<CharIterator>());
}

int main(int argc, char** argv)
{
    std::string text(std::istreambuf_iterator<char>(std::cin),
                     std::istreambuf_iterator<char>());

    std::vector<size_t> sa(text.length());
    iota(sa.begin(), sa.end(), 0);

    suffix_sort(text.begin(), text.end(), sa.begin(), sa.end());

    for (size_t i = 0; i < sa.size(); i++) {
        std::cout << i << '\t' << sa[i] << '\t' << text.substr(sa[i], std::string::npos) << std::endl;
    }

    for (int i = 1; i < argc; i++) {
        std::string search_word(argv[i]);
        std::pair<size_t, size_t> range = suffix_search(text.begin(), text.end(),
                                                        sa.begin(), sa.end(),
                                                        search_word.begin(), search_word.end());
        
        std::cout << search_word << ": [" << range.first << ", " << range.second << "]" << std::endl;
    }

    return 0;
}

なんか、半分近くが template か型か typedef の宣言になっている気がする。