#ifndef EVALUATE_FUNCTION_H
#define EVALUATE_FUNCTION_H

// MPI message tags used by itl::evaluate_function below.

// Tag of the one-int messages in which every rank tells each peer how many
// entries it wants from that peer.
#define EF_ASKING_TAG  4000
// Tag of the messages that carry the list of requested indices to the peer
// that will supply them.
#define EF_INDICES_TAG 4001
// Tag of the persistent send/receive requests that move the vector data.
#define EF_DATA_TAG    4002

#include "itl/interface/detail/pattern_finder.h"


namespace itl {

// Adaptor that lets a vector be addressed with shifted indices: element i
// of the adaptor is element (i - displacement) of the wrapped vector.
// evaluate_function uses it to translate global indices to local ones.
//
// The wrapped vector is held by reference, so it must outlive the adaptor.
// No bounds checking is performed.
template <class Vec>
struct vec_aux {
  typedef typename Vec::value_type value_type;

  // _v : vector to wrap (not copied).
  // d  : offset subtracted from every index passed to operator[].
  //
  // The initializers are listed in declaration order (displacement, then
  // v), which is the order in which the members are actually initialized.
  inline vec_aux(Vec& _v, int d) : displacement(d), v(_v) {}

  // Writable access to element (i - displacement) of the wrapped vector.
  inline value_type& operator[](int i)  {
    return v[i-displacement];
  }

  // Read access; returns a copy of element (i - displacement).
  inline value_type operator[](int i) const {
    return v[i-displacement];
  }

protected:
  int displacement;  // offset subtracted from incoming indices
  Vec& v;            // wrapped vector (not owned)
};

// Evaluates a user-supplied function object (Eval) on a vector whose
// entries are spread over several MPI ranks.
//
// Construction builds a communication plan once: it finds out which remote
// entries the local evaluation reads, exchanges those index lists with the
// other ranks and sets up persistent MPI send/receive requests. Every call
// to evaluate()/operator() then copies the local part of x into an internal
// buffer, starts the persistent requests to fetch the remote entries, and
// finally calls eval._evaluate(buffer, f).
//
// Copies of an evaluate_function share the buffer, the request array and
// the datatype array through a reference count; the arrays are deleted when
// the last copy is destroyed. The MPI requests and datatypes themselves are
// NOT freed by the destructor: call cleanup() for that.
//
// Rank and communicator information is obtained from Manager, which is
// declared elsewhere.
template <class T, class Eval>
class evaluate_function {
public:
  typedef T value_type;

  // Copy constructor: the new object shares all state with ef and
  // increments the shared reference count.
  evaluate_function(const evaluate_function& ef) 
    : ref_counter(ef.ref_counter),
      eval(ef.eval),
      buffer(ef.buffer),
      pos(ef.pos), 
      request(ef.request),
      EFtype(ef.EFtype),
      total_num_send_recv(ef.total_num_send_recv) {
    ++(*ref_counter);
  }

  // Copy assignment. Without it the compiler-generated operator would copy
  // ref_counter without incrementing it and would never release the state
  // previously held by *this, leading to a leak and a double delete.
  evaluate_function& operator=(const evaluate_function& ef) {
    if ( this != &ef ) {
      // Assign the evaluator first: if this throws, no count has changed.
      eval = ef.eval;

      // Take the new share before dropping the old one, so that assigning
      // between two handles of the same state never reaches zero.
      ++(*ef.ref_counter);
      drop_reference();

      ref_counter         = ef.ref_counter;
      buffer              = ef.buffer;
      pos                 = ef.pos;
      request             = ef.request;
      EFtype              = ef.EFtype;
      total_num_send_recv = ef.total_num_send_recv;
    }
    return *this;
  }

  // Builds the communication plan.
  //
  //   n    : number of entries of the internal buffer.
  //   _pos : index interval between two neighbouring ranks; rank r's block
  //          starts at buffer index r*_pos. Entries from processors*_pos up
  //          to n are treated as belonging to the last rank.
  //   _e   : function object. It is called here as
  //          _evaluate(pattern_finder, pfinder_aux) to discover the access
  //          pattern, and later as _evaluate(buffer, f) to do the real work.
  //
  // The constructor exchanges messages with every other rank, so it
  // presumably has to be called on all ranks of Manager::comm() -- confirm
  // against the callers.
  evaluate_function(int n, int _pos, Eval _e)
    : ref_counter(new int(1)),
      eval(_e),
      buffer(new value_type[n]),
      pos(_pos),
      request(0),
      EFtype(0),
      total_num_send_recv(0) {
    int processors = Manager::size();
    int rank       = Manager::rank();

    // Room for one receive and one send per peer.
    request = new MPI_Request[processors*2];

    // send_sizes[i]: number of entries rank i wants from this rank.
    // recv_sizes[i]: number of entries this rank wants from rank i.
    int* send_sizes = new int[processors];
    int* recv_sizes = new int[processors];
    for (int i=0; i<processors; i++)
      recv_sizes[i] = send_sizes[i] = 0;

    // Slot i (starting at i*pos) of each array holds the index list that
    // concerns rank i.
    int* recv_displacements = new int[n];
    int* send_displacements = new int[n];

    for (int i=0; i<n; ++i) {
      recv_displacements[i] = 0;
      send_displacements[i] = 0;
    }

    // Post the receives for the request counts of all peers.
    int count = 0;
    for (int i=0; i<processors; i++) {
      if ( i != rank ) {
        MPI_Irecv( send_sizes+i, 1, MPI_INT, i, 
                   EF_ASKING_TAG, Manager::comm(), 
                   &request[count++]);
      }
    }

    pattern_finder pf(recv_displacements);
    pfinder_aux    pf_aux;

    // Find the access pattern. Presumably this sets recv_displacements[g]
    // to a non-zero value for every global index g that eval reads --
    // confirm against pattern_finder.
    eval._evaluate(pf, pf_aux);

    // Compact each peer's slot in place: the flags are replaced by a packed
    // list of the flagged offsets (relative to the start of the slot).
    for (int i=0; i<processors; i++) {
      if ( i == rank ) continue;
      int b = i*pos;
      int k=0;
      int* rind = recv_displacements + b;

      for (int j=0; j<pos; j++)
        if ( rind[j] )
          rind[k++] =  j; //offset inside rank i's block

      recv_sizes[i] = k;
    }

    // Leftover entries (index >= processors*pos) are appended to the list of
    // the last rank, again as offsets relative to that rank's block start.
    int b = (processors-1)*pos + recv_sizes[processors-1];
    int k = recv_sizes[processors-1];
    for (int i = processors*pos; i < n; i++)
      if ( recv_displacements[i] ) {
        recv_displacements[b++] = i - (processors-1)*pos;
        k++;
      }
    recv_sizes[processors-1] = k;

    // Tell every peer how many entries are wanted from it. The sends go
    // into the second half of the request array, after the receives.
    count          = processors-1;
    for (int i=0; i<processors; i++) {
      if ( i != rank ) {
        MPI_Isend( recv_sizes+i, 1, MPI_INT, i,
                   EF_ASKING_TAG, Manager::comm(), request + count++);
      }
    }

    // Completes both directions: recv_sizes sent, send_sizes received.
    MPI_Waitall(count, request, MPI_STATUSES_IGNORE);

    // Receive the index lists of the entries each peer wants from us.
    // NOTE(review): each slot is pos ints apart, but on the last rank
    // send_sizes[i] could presumably be as large as n-(processors-1)*pos;
    // if that exceeds pos the receive would run into the next slot --
    // confirm.
    count = 0;
    for (int i=0; i<processors; i++) {
      if ( i != rank && send_sizes[i] ) {
        MPI_Irecv( send_displacements + i*pos, send_sizes[i], MPI_INT, i, 
                   EF_INDICES_TAG, Manager::comm(), 
                   &request[count++]);
      }
    }

    // Send our own index lists. Request slots that are skipped between the
    // receives and the sends still hold the handles completed by the
    // previous MPI_Waitall.
    count          = processors-1;
    for (int i=0; i<processors; i++) {
      if ( i != rank && recv_sizes[i] ) {

        MPI_Isend( recv_displacements+i*pos , recv_sizes[i], MPI_INT, i,
                   EF_INDICES_TAG, Manager::comm(), request + count++);
      }
    }

    // Complete the index exchange.
    MPI_Waitall(count, request, MPI_STATUSES_IGNORE);

    // Block-length array for MPI_Type_indexed: every block is one element.
    int* blocks = new int [n-(processors-1)*pos];
    for (int i=0; i < n-(processors-1)*pos ; i++)
      blocks[i] = 1;

    // One indexed datatype per active peer: first all send types, then all
    // receive types. The persistent requests below are created in the same
    // order, so request[c] always goes with EFtype[c].
    // NOTE(review): the element type is hard-coded to MPI_DOUBLE although
    // the buffer holds value_type -- confirm T is always double.
    EFtype = new MPI_Datatype [processors*2];
    count = 0;
    for (int i=0; i<processors; i++) 
      if ( i != rank && send_sizes[i] != 0 ) {
        MPI_Type_indexed(send_sizes[i], blocks, send_displacements+i*pos,
                         MPI_DOUBLE, & EFtype[count]);
        MPI_Type_commit(&EFtype[count]);
        count++;
      }

    for (int i=0; i<processors; i++) 
      if ( i != rank && recv_sizes[i] != 0 ) {
        MPI_Type_indexed(recv_sizes[i], blocks, recv_displacements+i*pos,
                         MPI_DOUBLE, & EFtype[count]);
        MPI_Type_commit(&EFtype[count]);
        count++;
      }

    count = 0;
    // Persistent sends: the requested entries of our own block.
    for (int i=0; i<processors; i++) 
      if ( i!=rank && send_sizes[i] !=0 ) {
        MPI_Send_init( buffer+rank*pos, 1, EFtype[count], i, 
                       EF_DATA_TAG, Manager::comm(),
                       request + count);
        count++;
      }

    // Persistent receives: the entries we need, stored in rank i's block.
    for (int i=0; i<processors; i++)
      if ( i!=rank && recv_sizes[i] !=0 ) {
        MPI_Recv_init( buffer+i*pos, 1, EFtype[count], i, 
                       EF_DATA_TAG, Manager::comm(), 
                       request + count);
        count++;
      }

    total_num_send_recv = count;

    delete [] blocks;
    delete [] recv_displacements;
    delete [] send_displacements;
    delete [] send_sizes;
    delete [] recv_sizes;    
  }

  // Drops this object's share of the state; the last owner deletes the
  // arrays. MPI requests and datatypes are not freed here (see cleanup()).
  inline ~evaluate_function() {
    drop_reference();
  }

  // Frees the persistent MPI requests and the committed MPI datatypes.
  // This is not done by the destructor and has to be called explicitly.
  // The handles are shared by all copies, so it should presumably be called
  // only once per set of copies, and evaluate() must not be used afterwards.
  void cleanup() {
    for (int i=0; i<total_num_send_recv; i++) {
      MPI_Request_free(request+i);
      MPI_Type_free(EFtype+i);
    }
  }

  // Computes f from x: gathers the remote entries of x that are needed and
  // then runs the evaluator. x is the local part of the vector; it is
  // copied into the buffer at offset pos*rank.
  template <class VecX, class VecF>
  inline void evaluate(const VecX& x, VecF& f) const {
    process_vector(x);
    evaluate(f);
  }

  // Function-call syntax for evaluate(x, f).
  template <class VecX, class VecF>
  inline void operator()(const VecX& x, VecF& f) const {
    evaluate(x, f);
  }
protected:
  // Decrements the shared reference count and deletes the shared arrays
  // when it reaches zero.
  void drop_reference() {
    --(*ref_counter);
    if ( ! *ref_counter ) {
      delete ref_counter;
      delete [] buffer;
      delete [] request;
      delete [] EFtype;
    }
  }

  // Copies the local vector x into this rank's block of the buffer, then
  // starts all persistent sends/receives and waits for their completion.
  template <class Vector>
  void process_vector(const Vector& x) const {
    int rank = Manager::rank();
    std::copy(x.begin(), x.end(), buffer + pos*rank);

    MPI_Startall(total_num_send_recv, request);

    //wait for all data to be sent/received
    MPI_Waitall(total_num_send_recv, request, MPI_STATUSES_IGNORE);
  }

  // Runs the evaluator. After process_vector, buffer is the global x array
  // except for those entries that will not be accessed.
  template <class Vec>
  inline void evaluate(Vec& f) const {
    //f_ is an adaptor that shifts indices by this rank's block start
    //(rank*pos) before they reach f.
    vec_aux<Vec> f_(f, Manager::rank()*pos);

    eval._evaluate(buffer, f_);
  }


  int* ref_counter;            // shared count of objects using the arrays
  Eval eval;                   // user-supplied function object
  mutable value_type* buffer;  // n entries, shared between copies
  int pos; //index interval between two neighbor cpus
  MPI_Request* request;        // persistent requests: sends, then receives
  MPI_Datatype* EFtype;        // indexed datatypes; EFtype[i] goes with request[i]
  int total_num_send_recv;     // number of valid entries in request/EFtype
};

} //namespace itl

#endif
