// cedar -- C++ implementation of Efficiently-updatable Double ARray trie
//  $Id: bench_static.cc 1507 2013-09-17 06:33:26Z ynaga $
// Copyright (c) 2013 Naoki Yoshinaga <ynaga@tkl.iis.u-tokyo.ac.jp>
#ifdef __APPLE__
#include <mach/mach.h>
#endif
#include <unistd.h>
#include <fcntl.h>
#include <err.h>
#include <sys/time.h>
#include <cstdio>
#include <cstring>
#include <cassert>
#include <vector>
#include <string>
#include <fstream>
#include <cedar.h>
#include <trie.h>
#include <doar/double_array.h>
#include <darts.h>
#ifdef USE_DARTS_CLONE_OLD
#include <darts-clone-0.32e5.h> // header/macros renamed; Darts -> DartsClone
#else
#include <darts-clone-0.32g.h>  // header/macros renamed; Darts -> DartsClone
#endif
#include <dastrie.h>
#include <tx/tx.hpp>
#include <ux/ux.hpp>
#include <marisa.h>

// static const
// Capacity in bytes of the stack buffer that each build () variant reads key
// data into.
static const size_t BUFFER_SIZE = 1 << 16;
// typedef
// Short aliases for the trie implementations exercised by this benchmark.
#if   defined (USE_CEDARPP)
// cedar instantiated with explicit, non-default-looking template arguments;
// main () reports this configuration under the label "cedar++".
// NOTE(review): the meaning of the individual arguments is defined in cedar.h.
typedef cedar::da <int, -1, -2, false>  cedar_t;
#else
typedef cedar::da <int>                 cedar_t;
#endif
typedef Darts::DoubleArray              darts_t;
typedef DartsClone::DoubleArray         dartsc_t;
typedef dutil::trie                     trie_t;
typedef dastrie::trie <int>             dastrie_t;
typedef Doar::Searcher                  doar_t;
typedef tx_tool::tx                     tx_t;
typedef ux::Trie                        ux_t;
typedef marisa::Trie                    marisa_t;

// Allocate a value-initialized trie of type T with new; bench () releases the
// result with delete.
template <typename T>
inline T* create () {
  T* const obj = new T ();
  return obj;
}

// libtrie: instances come from the library's own factory function rather
// than from a constructor call.
template <>
inline trie_t* create <trie_t> () {
  return trie_t::create_trie ();
}

// doar: no searcher is created up front, so a null pointer is returned.
// build <doar_t> () does not use its trie argument, and read_trie <doar_t> ()
// constructs the searcher from the saved index.
template <>
inline doar_t* create <doar_t> () {
  return static_cast <doar_t*> (0);
}

// Read the whole of `file` into a freshly allocated buffer.
//   file: path of the file to load
//   data: receives a new[]-allocated buffer holding the file contents; the
//         caller releases it with delete []
// Returns the number of bytes stored in `data`.
// Terminates the process via err ()/errx () when the file cannot be opened,
// sized or read.
size_t read_data (char* file, char*& data) {
  const int fd = ::open (file, O_RDONLY);
  if (fd < 0)
    errx (1, "no such file: %s", file);
  // a failed seek (e.g. on a pipe) returns -1, which must not be turned into
  // a huge allocation size
  const off_t length = ::lseek (fd, 0L, SEEK_END);
  if (length < 0 || ::lseek (fd, 0L, SEEK_SET) < 0)
    err (1, "cannot seek: %s", file);
  const size_t size = static_cast <size_t> (length);
  data = new char[size];
  // read () may deliver fewer bytes than requested, so keep reading until the
  // buffer is full
  size_t filled = 0;
  while (filled < size) {
    const ssize_t nread = ::read (fd, data + filled, size - filled);
    if (nread < 0)
      err (1, "cannot read: %s", file);
    if (nread == 0)
      break; // file became shorter after it was sized
    filled += static_cast <size_t> (nread);
  }
  ::close (fd);
  return filled;
}

// Return the resident size of the current process in bytes, or 0 when it
// cannot be determined.
//   __APPLE__: taken from task_info (TASK_BASIC_INFO)
//   otherwise: second field of /proc/self/statm (resident pages, see procps)
//              multiplied by the page size
size_t get_process_size () {
#ifdef __APPLE__
  struct task_basic_info t_info;
  mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_COUNT;
  if (task_info (current_task (), TASK_BASIC_INFO,
                 reinterpret_cast <task_info_t> (&t_info), &t_info_count)
      != KERN_SUCCESS)
    return 0;
  return t_info.resident_size;
#else
  FILE* fp = std::fopen ("/proc/self/statm", "r");
  if (! fp)
    return 0; // procfs not available
  // unsigned long matches the %lu conversions exactly (size_t need not)
  unsigned long total (0), resident (0);
  const int matched = std::fscanf (fp, "%lu %lu ", &total, &resident);
  std::fclose (fp);
  if (matched != 2)
    return 0;
  return static_cast <size_t> (resident) * ::getpagesize ();
#endif
}

// Report whether the byte p terminates a key.
#ifdef USE_BINARY_DATA
// binary data: keys are separated by NUL bytes; the input is left untouched
inline bool is_splitter (const char &p) { return '\0' == p; }
#else
// text data: a newline terminates the key and is overwritten with NUL in
// place, which leaves the key usable as a C string
inline bool is_splitter (char &p) {
  if (p == '\n') {
    p = '\0';
    return true;
  }
  return false;
}
#endif

// cedar/darts/darts-clone
// Exact-match lookup through the exactMatchSearch () member template.
// Returns true when the search result for key[0..len) is non-negative.
template <typename T>
inline bool lookup_key (T* t, char* key, size_t len) {
  const int result = t->template exactMatchSearch <int> (key, len);
  return result >= 0;
}

// libtrie
// Lookup via search (); the value written by the library is discarded and
// only the outcome of the call is reported.
template <>
inline bool lookup_key <trie_t> (trie_t* t, char* key, size_t len) {
  int found_value = 0;
  const bool hit = t->search (key, len, &found_value);
  return hit;
}

// dastrie
// Lookup via find (); the key is passed without its length, and the value
// written by the library is discarded.
template <>
inline bool lookup_key <dastrie_t> (dastrie_t* t, char* key, size_t len) {
  int found_value = 0;
  const bool hit = t->find (key, found_value);
  return hit;
}

// doar
// Lookup via search (); the key is passed without its length, so the length
// argument is unused.
template <>
inline bool lookup_key <doar_t> (doar_t* t, char* key, size_t) {
  const bool hit = t->search (key);
  return hit;
}

// tx
// Lookup via prefixSearch (): the key counts as found only when the search
// does not return NOTFOUND and the reported match length equals len.
template <>
inline bool lookup_key <tx_t> (tx_t* t, char* key, size_t len) {
  size_t matched = 0;
  if (t->prefixSearch (key, len, matched) == tx_tool::tx::NOTFOUND)
    return false;
  return matched == len;
}

// ux
// Lookup via prefixSearch (): the key counts as found only when the search
// does not return NOTFOUND and the reported match length equals len.
template <>
inline bool lookup_key <ux_t> (ux_t* t, char* key, size_t len) {
  size_t matched = 0;
  if (t->prefixSearch (key, len, matched) == ux::NOTFOUND)
    return false;
  return matched == len;
}

// marisa
// Lookup through a marisa::Agent whose query is set from the key pointer
// alone; the len argument is not passed on.
template <>
inline bool lookup_key <marisa_t> (marisa_t* t, char* key, size_t len) {
  marisa::Agent agent;
  agent.set_query (key);
  return t->lookup (agent);
}

// lookup
// Scan data[0..size) for key terminators and query the trie t with every
// terminated key.
//   n:  incremented once per key
//   n_: incremented once per key for which lookup_key () returns true
// Bytes after the last terminator are not looked up.
template <typename T>
void lookup (T* t, char* data, size_t size, int& n, int& n_) {
  size_t head = 0; // offset of the first byte of the current key
  for (size_t pos = 0; pos != size; ++pos) {
    if (! is_splitter (data[pos]))
      continue;
    ++n;
    if (lookup_key (t, data + head, pos - head))
      ++n_;
    head = pos + 1;
  }
}

// darts/darts-clone
// Read keys from fd, build the trie t from all of them in one call and save
// it to the file `index`.
//   t:     trie offering build (num, keys, lengths, values) and save (path)
//   fd:    open descriptor to read the key data from
//   n:     incremented once per key; the incremented count becomes the value
//          stored for that key
//   index: path handed to save ()
// Terminates via err ()/errx () on a read error or when a key does not fit in
// the read buffer.  Bytes after the last terminator are ignored.
template <typename T>
void build (T* t, int fd, int& n, char* index) {
  std::vector <const char*> key;
  std::vector <size_t>      len;
  std::vector <int>         val;
  char data[BUFFER_SIZE];
  ssize_t start (0), end (0), nread (0);
  // The loop is driven by the byte count of each read ().  Adding the
  // carried-over bytes to the loop condition never reached zero when the
  // input lacked a final terminator, and scanned past the buffer when
  // read () failed.
  while ((nread = ::read (fd, data + end, BUFFER_SIZE - end)) > 0) {
    const ssize_t size = end + nread;
    for (; end != size; ++end)
      if (is_splitter (data[end])) {
        // the buffer is reused, so every key needs its own copy
        key.push_back (strdup (data + start));
        len.push_back (end - start);
        val.push_back (++n);
        start = end + 1;
      }
    // move the unterminated tail to the front; the next read () appends to it
    std::memmove (data, data + start, size - start);
    end = size - start; start = 0;
  }
  if (nread < 0)
    err (1, "read failed");
  if (end == static_cast <ssize_t> (BUFFER_SIZE)) // buffer full, no terminator
    errx (1, "key too long for %lu-byte buffer",
          static_cast <unsigned long> (BUFFER_SIZE));
  t->build (key.size (), &key[0], &len[0], &val[0]);
  t->save (index);
  for (std::vector <const char*>::iterator it = key.begin ();
       it != key.end (); ++it)
    ::free (const_cast <char*> (*it));
}

// cedar
// Read keys from fd, insert each one into t as soon as it is read, and save
// the trie to the file `index`.
//   n: incremented once per key; the incremented count is assigned to the
//      reference returned by update () for that key
// Terminates via err ()/errx () on a read error or when a key does not fit in
// the read buffer.  Bytes after the last terminator are ignored.
template <>
void build <cedar_t> (cedar_t* t, int fd, int& n, char* index) {
  char data[BUFFER_SIZE];
  ssize_t start (0), end (0), nread (0);
  // The loop is driven by the byte count of each read ().  Adding the
  // carried-over bytes to the loop condition never reached zero when the
  // input lacked a final terminator, and scanned past the buffer when
  // read () failed.
  while ((nread = ::read (fd, data + end, BUFFER_SIZE - end)) > 0) {
    const ssize_t size = end + nread;
    for (; end != size; ++end)
      if (is_splitter (data[end]))
        t->update (data + start, end - start) = ++n, start = end + 1;
    // move the unterminated tail to the front; the next read () appends to it
    std::memmove (data, data + start, size - start);
    end = size - start; start = 0;
  }
  if (nread < 0)
    err (1, "read failed");
  if (end == static_cast <ssize_t> (BUFFER_SIZE)) // buffer full, no terminator
    errx (1, "key too long for %lu-byte buffer",
          static_cast <unsigned long> (BUFFER_SIZE));
  t->save (index);
}

// libtrie
// Read keys from fd, insert each one into t together with the running key
// count n as its value, then call build (index) on the trie.
// Terminates via err ()/errx () on a read error or when a key does not fit in
// the read buffer.  Bytes after the last terminator are ignored.
template <>
void build <trie_t> (trie_t* t, int fd, int& n, char* index) {
  char data[BUFFER_SIZE];
  ssize_t start (0), end (0), nread (0);
  // The loop is driven by the byte count of each read ().  Adding the
  // carried-over bytes to the loop condition never reached zero when the
  // input lacked a final terminator, and scanned past the buffer when
  // read () failed.
  while ((nread = ::read (fd, data + end, BUFFER_SIZE - end)) > 0) {
    const ssize_t size = end + nread;
    for (; end != size; ++end)
      if (is_splitter (data[end]))
        t->insert (data + start, end - start, ++n), start = end + 1;
    // move the unterminated tail to the front; the next read () appends to it
    std::memmove (data, data + start, size - start);
    end = size - start; start = 0;
  }
  if (nread < 0)
    err (1, "read failed");
  if (end == static_cast <ssize_t> (BUFFER_SIZE)) // buffer full, no terminator
    errx (1, "key too long for %lu-byte buffer",
          static_cast <unsigned long> (BUFFER_SIZE));
  t->build (index);
}

// doar
// Read keys from fd, hand all of them to a Doar::Builder and save the result
// to the file `index`.  The trie argument t is not used; n is incremented
// once per key.
// Terminates via err ()/errx () on a read error or when a key does not fit in
// the read buffer.  Bytes after the last terminator are ignored.
template <>
void build <doar_t> (doar_t* t, int fd, int& n, char* index) {
  Doar::Builder builder;
  std::vector <const char*> key;
  char data[BUFFER_SIZE];
  ssize_t start (0), end (0), nread (0);
  // The loop is driven by the byte count of each read ().  Adding the
  // carried-over bytes to the loop condition never reached zero when the
  // input lacked a final terminator, and scanned past the buffer when
  // read () failed.
  while ((nread = ::read (fd, data + end, BUFFER_SIZE - end)) > 0) {
    const ssize_t size = end + nread;
    for (; end != size; ++end)
      if (is_splitter (data[end])) // the buffer is reused, so copy the key
        key.push_back (strdup (data + start)), ++n, start = end + 1;
    // move the unterminated tail to the front; the next read () appends to it
    std::memmove (data, data + start, size - start);
    end = size - start; start = 0;
  }
  if (nread < 0)
    err (1, "read failed");
  if (end == static_cast <ssize_t> (BUFFER_SIZE)) // buffer full, no terminator
    errx (1, "key too long for %lu-byte buffer",
          static_cast <unsigned long> (BUFFER_SIZE));
  builder.build (&key[0], key.size ());
  builder.save (index);
  for (std::vector <const char*>::iterator it = key.begin ();
       it != key.end (); ++it)
    ::free (const_cast <char*> (*it));
}

// dastrie
// Read keys from fd, collect them as (key, running count) records, build them
// with a dastrie::builder and write the result to the file `index`.  The trie
// argument t is not used; n is incremented once per key.
// Terminates via err ()/errx () on a read error or when a key does not fit in
// the read buffer.  Bytes after the last terminator are ignored.
template <>
void build <dastrie_t> (dastrie_t* t, int fd, int& n, char* index) {
  typedef dastrie::builder <char*, int>::record_type record_t;
  dastrie::builder <char*, int> builder;
  std::vector <record_t> key_val;
  char data[BUFFER_SIZE];
  ssize_t start (0), end (0), nread (0);
  // The loop is driven by the byte count of each read ().  Adding the
  // carried-over bytes to the loop condition never reached zero when the
  // input lacked a final terminator, and scanned past the buffer when
  // read () failed.
  while ((nread = ::read (fd, data + end, BUFFER_SIZE - end)) > 0) {
    const ssize_t size = end + nread;
    for (; end != size; ++end)
      if (is_splitter (data[end])) {
        // the buffer is reused, so every record owns a copy of its key
        record_t r = { ::strdup (data + start), ++n };
        key_val.push_back (r), start = end + 1;
      }
    // move the unterminated tail to the front; the next read () appends to it
    std::memmove (data, data + start, size - start);
    end = size - start; start = 0;
  }
  if (nread < 0)
    err (1, "read failed");
  if (end == static_cast <ssize_t> (BUFFER_SIZE)) // buffer full, no terminator
    errx (1, "key too long for %lu-byte buffer",
          static_cast <unsigned long> (BUFFER_SIZE));
  builder.build (&key_val[0], &key_val[0] + key_val.size ());
  std::ofstream ofs (index, std::ios::binary); builder.write (ofs); ofs.close ();
  for (std::vector <record_t>::iterator it = key_val.begin ();
       it != key_val.end (); ++it)
    ::free (it->key);
}

// tx
// Read keys from fd into a vector of strings and pass it, together with the
// path `index`, to t->build ().  n is incremented once per key.
// Terminates via err ()/errx () on a read error or when a key does not fit in
// the read buffer.  Bytes after the last terminator are ignored.
template <>
void build <tx_t> (tx_t* t, int fd, int& n, char* index) {
  std::vector <std::string> str;
  char data[BUFFER_SIZE];
  ssize_t start (0), end (0), nread (0);
  // The loop is driven by the byte count of each read ().  Adding the
  // carried-over bytes to the loop condition never reached zero when the
  // input lacked a final terminator, and scanned past the buffer when
  // read () failed.
  while ((nread = ::read (fd, data + end, BUFFER_SIZE - end)) > 0) {
    const ssize_t size = end + nread;
    for (; end != size; ++end)
      if (is_splitter (data[end]))
        str.push_back (data + start), ++n, start = end + 1;
    // move the unterminated tail to the front; the next read () appends to it
    std::memmove (data, data + start, size - start);
    end = size - start; start = 0;
  }
  if (nread < 0)
    err (1, "read failed");
  if (end == static_cast <ssize_t> (BUFFER_SIZE)) // buffer full, no terminator
    errx (1, "key too long for %lu-byte buffer",
          static_cast <unsigned long> (BUFFER_SIZE));
  t->build (str, index);
}

// ux
// Read keys from fd into a vector of strings, build t from it and save the
// trie to the file `index`.  n is incremented once per key.
// Terminates via err ()/errx () on a read error or when a key does not fit in
// the read buffer.  Bytes after the last terminator are ignored.
template <>
void build <ux_t> (ux_t* t, int fd, int& n, char* index) {
  std::vector <std::string> str;
  char data[BUFFER_SIZE];
  ssize_t start (0), end (0), nread (0);
  // The loop is driven by the byte count of each read ().  Adding the
  // carried-over bytes to the loop condition never reached zero when the
  // input lacked a final terminator, and scanned past the buffer when
  // read () failed.
  while ((nread = ::read (fd, data + end, BUFFER_SIZE - end)) > 0) {
    const ssize_t size = end + nread;
    for (; end != size; ++end)
      if (is_splitter (data[end]))
        str.push_back (data + start), ++n, start = end + 1;
    // move the unterminated tail to the front; the next read () appends to it
    std::memmove (data, data + start, size - start);
    end = size - start; start = 0;
  }
  if (nread < 0)
    err (1, "read failed");
  if (end == static_cast <ssize_t> (BUFFER_SIZE)) // buffer full, no terminator
    errx (1, "key too long for %lu-byte buffer",
          static_cast <unsigned long> (BUFFER_SIZE));
  t->build (str);
  t->save (index);
}

// marisa
// Read keys from fd into a marisa::Keyset, build t from it and save the trie
// to the file `index`.  n is incremented once per key.
// Terminates via err ()/errx () on a read error or when a key does not fit in
// the read buffer.  Bytes after the last terminator are ignored.
template <>
void build <marisa_t> (marisa_t* t, int fd, int& n, char* index) {
  marisa::Keyset keyset;
  char data[BUFFER_SIZE];
  ssize_t start (0), end (0), nread (0);
  // The loop is driven by the byte count of each read ().  Adding the
  // carried-over bytes to the loop condition never reached zero when the
  // input lacked a final terminator, and scanned past the buffer when
  // read () failed.
  while ((nread = ::read (fd, data + end, BUFFER_SIZE - end)) > 0) {
    const ssize_t size = end + nread;
    for (; end != size; ++end)
      if (is_splitter (data[end]))
        keyset.push_back (data + start), ++n, start = end + 1;
    // move the unterminated tail to the front; the next read () appends to it
    std::memmove (data, data + start, size - start);
    end = size - start; start = 0;
  }
  if (nread < 0)
    err (1, "read failed");
  if (end == static_cast <ssize_t> (BUFFER_SIZE)) // buffer full, no terminator
    errx (1, "key too long for %lu-byte buffer",
          static_cast <unsigned long> (BUFFER_SIZE));
  t->build (keyset);
  t->save (index);
}

// cedar/darts/darts-clone
// Create a fresh trie, store it in t and load the index file into it with
// open ().
template <typename T>
void read_trie (T*& t, char* index) {
  t = create <T> ();
  t->open (index);
}

// libtrie
// The library factory takes the index path directly and returns the trie.
template <>
void read_trie <trie_t> (trie_t*& t, char* index) {
  t = dutil::trie::create_trie (index);
}

// dastrie
// Create a fresh trie, store it in t and read the index file into it from a
// binary input stream.
template <>
void read_trie <dastrie_t> (dastrie_t*& t, char* index) {
  t = create <dastrie_t> ();
  std::ifstream ifs (index, std::ios::binary);
  t->read (ifs);
  ifs.close ();
}

// doar
// The searcher is constructed directly from the index path.
template <>
void read_trie <doar_t> (doar_t*& t, char* index) {
  t = new doar_t (index);
}

// tx
// Create a fresh trie, store it in t and load the index file with read ().
template <>
void read_trie <tx_t> (tx_t*& t, char* index) {
  t = create <tx_t> ();
  t->read (index);
}

// ux
// Create a fresh trie, store it in t and load the index file with load ().
template <>
void read_trie <ux_t> (ux_t*& t, char* index) {
  t = create <ux_t> ();
  t->load (index);
}

// marisa-trie
// Create a fresh trie, store it in t and load the index file with load ().
template <>
void read_trie <marisa_t> (marisa_t*& t, char* index) {
  t = create <marisa_t> ();
  t->load (index);
}

// Return the size in bytes of the file `index`, obtained by seeking to its
// end.  Terminates the process via errx ()/err () when the file cannot be
// opened or is not seekable.
size_t get_size (char* index) {
  const int fd = ::open (index, O_RDONLY);
  if (fd < 0)
    errx (1, "no such file: %s", index);
  const off_t length = ::lseek (fd, 0L, SEEK_END);
  // a failed seek returns -1, which must not be reported as a huge size
  if (length < 0)
    err (1, "cannot seek: %s", index);
  ::close (fd);
  return static_cast <size_t> (length);
}

// Run the benchmark for trie type T and report the results on stderr.
//   label:   name printed in the section header
//   keys:    file of keys to build the trie from; "-" skips the build step
//   index:   file the trie is saved to by the build step and loaded from
//            afterwards
//   queries: file of keys to look up; "-" skips the search step
// Terminates via errx () when the key file cannot be opened.
template <typename T>
void bench (const char* label, char* keys, char* index, char* queries) {
  T* t = 0;
  int n (0), n_(0);
  double elapsed = 0;
  struct timeval start, end;
  std::fprintf (stderr, "---- %-7s ---------------------------------------\n", label);
  if (std::strcmp (keys, "-") != 0) {
    // process size sampled before the trie is created
    const size_t rss  = get_process_size ();
    // build trie
    t = create <T> ();
    gettimeofday (&start, NULL);
    int fd = ::open (keys, O_RDONLY);
    if (fd < 0)
      errx (1, "no such file: %s", keys);
    build (t, fd, n, index);
    ::close (fd);
    gettimeofday (&end, NULL);
    elapsed = end.tv_sec + end.tv_usec * 1e-6 - (start.tv_sec + start.tv_usec * 1e-6);
    // size_t is printed through an explicit unsigned long so that the
    // conversion specifier always matches the argument type
    std::fprintf (stderr, "Init RSS:            %.2f MiB (%lu bytes)\n",
                  rss / 1048576.0, static_cast <unsigned long> (rss));
    // no per-key time can be derived from an empty key file
    std::fprintf (stderr, "Time to insert:      %.2f sec (%.2f nsec per key)\n",
                  elapsed, n ? elapsed * 1e9 / n : 0.0);
    std::fprintf (stderr, "Words:               %d\n\n", n);
    // the trie is dropped here and reloaded from the saved index below
    delete t;
  }
  // load
  const size_t rss = get_size (index);
  read_trie (t, index);
  std::fprintf (stderr, "Trie size:           %.2f MiB (%lu bytes)\n",
                rss / 1048576.0, static_cast <unsigned long> (rss));
  // load data
  if (std::strcmp (queries, "-") != 0) {
    char* data = 0;
    const size_t size = read_data (queries, data);
    // search
    n = n_ = 0;
    gettimeofday (&start, NULL);
    lookup (t, data, size, n, n_);
    gettimeofday (&end, NULL);
    elapsed = end.tv_sec + end.tv_usec * 1e-6 - (start.tv_sec + start.tv_usec * 1e-6);
    std::fprintf (stderr, "Time to search:      %.2f sec (%.2f nsec per key)\n",
                  elapsed, n ? elapsed * 1e9 / n : 0.0);
    std::fprintf (stderr, "Words:               %d\n", n);
    std::fprintf (stderr, "Found:               %d\n", n_);
    delete [] data;
  }
  delete t;
}

// Entry point.  Expects three arguments (keys, index, test) and runs bench ()
// once for every trie implementation enabled through a USE_* macro.
int main (int argc, char** argv) {
  if (argc < 4)
    ::errx (1, "Usage: %s keys index test", argv[0]);
  char* const keys    = argv[1]; // key file to build from
  char* const index   = argv[2]; // index file to save to / load from
  char* const queries = argv[3]; // key file to look up
#ifdef USE_CEDAR
  bench <cedar_t>   ("cedar",           keys, index, queries);
#endif
#ifdef USE_CEDARPP
  bench <cedar_t>   ("cedar++",         keys, index, queries);
#endif
#ifdef USE_LIBTRIE
  bench <trie_t>    ("libtrie",         keys, index, queries);
#endif
#ifdef USE_DOAR
  bench <doar_t>    ("doar",            keys, index, queries);
#endif
#ifdef USE_DARTS
  bench <darts_t>   ("darts",           keys, index, queries);
#endif
#ifdef USE_DARTS_CLONE
  bench <dartsc_t>  ("darts-clone",     keys, index, queries);
#endif
#ifdef USE_DARTS_CLONE_OLD
  bench <dartsc_t>  ("darts-clone_old", keys, index, queries);
#endif
#ifdef USE_DASTRIE
  bench <dastrie_t> ("dastrie",         keys, index, queries);
#endif
#ifdef USE_TX
  bench <tx_t>      ("tx",              keys, index, queries);
#endif
#ifdef USE_UX
  bench <ux_t>      ("ux",              keys, index, queries);
#endif
#ifdef USE_MARISA
  bench <marisa_t>  ("marisa-trie",     keys, index, queries);
#endif
  return 0;
}
/*
  g++ -DHAVE_CONFIG_H -I. -I.. -I$HOME/local/include -O2 -g bench_static.cc -o bench_static_marisa -L$HOME/local/lib -ltx -lux -lmarisa -ltrie
  g++ -DHAVE_CONFIG_H -DUSE_BINARY_DATA -I. -I.. -I$HOME/local/include -O2 -g bench_static.cc -o bench_static_marisa_bin -L$HOME/local/lib -ltx -lux -lmarisa -ltrie
*/
