//===========================================================================
//  CVS Information:                                                         
//                                                                           
//     $RCSfile: bratu.cpp,v $  $Revision: 1.4 $  $State: Exp $ 
//     $Author: llee $  $Date: 2001/10/26 16:55:38 $ 
//     $Locker:  $ 
//---------------------------------------------------------------------------
//                                                                           
// DESCRIPTION                                                               
//  parallel bratu problem, using Newton-Krylov method
//
//  -Laplacian(U) - lambda * exp (U) = 0
//  u = 0 at boundary. lambda = 6.0
//                                                                           
// 
//  nonlinear equations: f(i, j) = 0 where
//  f(i,j) =  U(i,j)                                       if i,j = 0, n
//  f(i,j) =  4U(i,j)-U(i-1,j)-U(i+1,j)-U(i,j-1)-U(i,j+1)  otherwise
//                              -hx*hy*lambda*exp(U(i,j))
// 
//---------------------------------------------------------------------------
//                                                                           
// LICENSE AGREEMENT                                                         
//=======================================================================
// Copyright (C) 1997-2001
// Authors: Andrew Lumsdaine <lums@osl.iu.edu> 
//          Lie-Quan Lee     <llee@osl.iu.edu>
//
// This file is part of the Iterative Template Library
//
// You should have received a copy of the License Agreement for the
// Iterative Template Library along with the software;  see the
// file LICENSE.  
//
// Permission to modify the code and to distribute modified code is
// granted, provided the text of this NOTICE is retained, a notice that
// the code was modified is included with the above COPYRIGHT NOTICE and
// with the COPYRIGHT NOTICE in the LICENSE file, and that the LICENSE
// file is distributed with the modified code.
//
// LICENSOR MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED.
// By way of example, but not limitation, Licensor MAKES NO
// REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
// PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE COMPONENTS
// OR DOCUMENTATION WILL NOT INFRINGE ANY PATENTS, COPYRIGHTS, TRADEMARKS
// OR OTHER RIGHTS.
//=======================================================================
//---------------------------------------------------------------------------
//                                                                           
// REVISION HISTORY:                                                         
//                                                                           
// $Log: bratu.cpp,v $
// Revision 1.4  2001/10/26 16:55:38  llee
// MPISpro 7.3 test finished
//
// Revision 1.3  2001/10/23 16:15:01  llee
// *** empty log message ***
//
// Revision 1.2  2001/10/19 15:27:40  llee
// parallel bratu examples: done
//
// Revision 1.1  2001/10/19 05:23:06  llee
// *** empty log message ***
//                                                                           
//===========================================================================
#include <fstream>
#include <iostream>
#include <vector>
#include <iterator>

#include <mtl/dense1D.h>
#include <mtl/matrix.h>
#include <mtl/mtl.h>

#include "itl/interface/mtl_parallel.h"
#include <itl/krylov/gmres.h>
#include <itl/krylov/bicgstab.h>
#include <itl/krylov/cgs.h>
#include <itl/krylov/tfqmr.h>
#include "itl/preconditioner/block_ilu.h"

#include "parser.h"

using namespace itl;

typedef mtl::matrix<double, mtl::rectangle<>,
		    mtl::compressed<>,
		    mtl::row_major>::type Matrix;
typedef mtl::matrix<double, mtl::rectangle<>,
		    mtl::array<mtl::compressed<> >,
		    mtl::row_major>::type InitMatrix;

double lamhxhy;

//BZ_DECLARE_STENCIL2(Bratu_stencil, U, F)
//  F = -Laplacian2D(U) - exp(U) * lamhxhy;
//BZ_END_STENCIL

//#define MIN(X,Y) where((X) < (Y), (X), (Y))

/*
  Except values on boundary, 
  F = -Laplacian * U - lambda*hx*hy*exp(U)
  On the boundary,
  F = U
 */
template <class ParallelMatrix>
struct apply_op {
  apply_op(ParallelMatrix& AA, int disp_, int nx_, int ny_)
    : A(AA), disp(disp_), nx(nx_), ny(ny_) {}

  template <class Vector1DU, class Vector1DF>
  void operator()( Vector1DU& U, Vector1DF& F) {
    //change A to -Laplacian
    typename ParallelMatrix::matrix_type AA = A.local_matrix();

    int index_min = disp;
    int index_max = disp + F.size() - 1;

    { //find the diagonal, set it to be 4.0
      int local_nx_begin = 0;
      int local_nx_end   = nx;

      int ny_pos = ny / Manager::size();
      int local_ny_begin = Manager::rank()*ny_pos;
      int local_ny_end   = local_ny_begin + ny_pos;
      if ( Manager::rank() == Manager::size() - 1 ) //last processor
	local_ny_end = ny;

      int i, j;

      for (j=local_ny_begin; j<local_ny_end; ++j) {
	int index_base = j * nx;
	if (j == 0 || j == ny-1)
	  continue;
	
	for (i=local_nx_begin; i<local_nx_end; ++i) {
	  
	  int index = index_base + i;
	  int row = index - index_min;
	  
	  if (i == 0 || i == nx-1)
	    continue;

	  AA(row, index) = 4.0;
	}
      }
    }

    itl::mult(A, U, F);

    //add diagonal part
    for (int i=0; i<F.size(); i++) {
      F[i] -= lamhxhy*exp(U[i]);
    }

    //The following is necessary since above loop covers all the elements in F.
    //However, for those elements at the boundary, above operation should not
    //happen. Therefore, we set the correct value back.
#if 1
     //j=0
    for (int i=0; i<nx; i++) {
      int index = i * ny;
      if ( index < index_min ) continue;
      if ( index > index_max ) break;
      int row = index - index_min;
      F[row] = U[row];
    }
    //j=ny-1;
    for (int i=0; i<nx; i++) {
      int index = i * ny + ny - 1;
      if ( index < index_min ) continue;
      if ( index > index_max ) break;
      int row = index - index_min;
      F[row] = U[row];
    }
    //i=0
    for (int j=0; j<ny; j++) {
      int index = j;
      if ( index < index_min ) continue;
      if ( index > index_max ) break;
      int row = index - index_min;
      F[row] = U[row];
    }
    //i=nx-1
    for (int j=0; j<ny; j++) {
      int index = (nx-1) * ny + j;
      if ( index < index_min ) continue;
      if ( index > index_max ) break;
      int row = index - index_min;
      F[row] = U[row];
    }
#endif   
  }
  ParallelMatrix& A;
  int disp; //displacement
  int nx;
  int ny;
};

template <class T>
const T& MIN(const T& a, const T& b)
{
  if (a < b) return a;
  else return b;
}

typedef mtl::dense1D<double> Vector;

std::ostream& operator<< (std::ostream& out, const Vector& v) // 
{
  //std::copy(v.begin(), v.end(), std::ostream_iterator<double, char>(out, " "));
  for (size_t i=0; i<v.size(); ++i)
    std::cout << v[i] << " ";
  out << std::endl;
  return out;
}

int 
main(int argc, char *argv[])
{
  using std::cout;
  using std::endl;

  Manager::init(argc, argv);

  int ksp_monitor = 0;
  int is_precond  = 0;
  int kmax;
  int iter_max;
  int restart;
  std::string ksp_type;

  double ksp_rtol, ksp_atol, newton_rtol, newton_atol, newton_stol;

  int nx, ny;

  std::parser myparser(argc, argv);

  myparser.register_flag("--mx", 1, "number of points in x axis in the mesh");
  myparser.register_flag("--my", 1, "number of points in y axis in the mesh");
  myparser.register_flag("--ksp_max_it", 1, "maxmal iterations allowed in KSP");  
  myparser.register_flag("--ksp_atol", 1, "Absolute tolerance in KSP iteration");  
  myparser.register_flag("--ksp_rtol", 1, "Relative tolerance in KSP iteration");  
  myparser.register_flag("--newton_rtol", 1, "Relative tolerance in Newton iteration");
  myparser.register_flag("--newton_stol", 1, "Relative tolerance in two continuous Newton iteration");
  myparser.register_flag("--newton_atol", 1, "Absolute tolerance in Newton iteration");
  myparser.register_flag("--gmres_restart", 1, "GMRES restart");
  myparser.register_flag("--newton_max_it", 1, "Maximum iterations in Newton");
  myparser.register_flag("--ksp_monitor", 0, "Monitoring KSP tolerance.");
  myparser.register_flag("--is_precond", 0, "Precondtioning");
  myparser.register_flag("--ksp_type", 1, "KSP type.");

  myparser.help();
 
  myparser.get("--mx", nx, Manager::size()*8);
  myparser.get("--my", ny, nx);
  myparser.get("--ksp_max_it", iter_max, 256);
  myparser.get("--ksp_atol", ksp_atol, 1.0e-50);
  myparser.get("--ksp_rtol", ksp_rtol, 1.0e-8);
  myparser.get("--newton_rtol", newton_rtol, 1.0e-8);
  myparser.get("--newton_stol", newton_stol, 1.0e-8);
  myparser.get("--newton_atol", newton_atol, 1.0e-50);
  myparser.get("--gmres_restart", restart, 60);
  myparser.get("--newton_max_it", kmax, 60);
  myparser.get("--ksp_monitor", ksp_monitor, 0);
  myparser.get("--is_precond", is_precond, 0);
  myparser.get("--ksp_type", ksp_type, std::string("gmres"));
  

  if ( !Manager::rank() ) {
    cout << "Relative tolerance for Newton iteration: " << newton_rtol << endl
	 << "Absolute tolerance for Newton iteration: " << newton_atol << endl
	 << "Secussive tolerance for Newton iteration: " << newton_stol << endl
	 << "Relative tolerance for KSP iteraion: " << ksp_rtol << endl
	 << "Absolute tolerance for KSP iteraion: " << ksp_atol << endl;
  }

  const double lambda = 6.0;
  int msize = nx * ny;
  int pos = msize / Manager::size();
  
  int local_nrows = pos;
  if ( Manager::rank() == Manager::size() - 1 ) //last processor
    local_nrows = msize - Manager::rank() * pos;

  int local_nx_begin = 0;
  int local_nx_end   = nx;

  int ny_pos = ny / Manager::size();
  int local_ny_begin = Manager::rank()*ny_pos;
  int local_ny_end   = local_ny_begin + ny_pos;
  if ( Manager::rank() == Manager::size() - 1 ) //last processor
    local_ny_end = ny;

  Vector U(local_nrows, 0.0);
  Vector F(local_nrows, 0.0);
  Vector DX(local_nrows, 0.0);

  const double hx = 1./(nx-1), hy = 1./(ny-1);
  lamhxhy = lambda * hx * hy;
  
  if ( !Manager::rank() )
    cout << "( "<< Manager::size() 
	 << " CPUs) Bratu problem: " << nx << " x " << ny << endl;

  {
    //initialize U
    /*
      for(int ix=1; ix<nx-2; ix++)
      for(int jx=1; jx<ny-2; jx++)
      U(ix+1, jx+1) =  (lambda / (lambda+1.0)) *
      sqrt(MIN(hy * MIN(jx, ny-jx-1), hx * MIN(ix,nx-ix-1)));
    */
    
    /* It seems that above initialization is wrong. 

       The correct one is following: 

       for(int ix=1; ix<nx-1; ix++)
       for(int jx=1; jx<ny-1; jx++)
          U(ix, jx) = (lambda / (lambda+1.0)) *
	              sqrt(MIN(hy * MIN(jx, ny-jx-1), hx * MIN(ix,nx-ix-1)));
      
     */
    int index_min = pos * Manager::rank();
    int index_max = index_min + local_nrows - 1;
    
    for(int ix=1; ix<nx-1; ix++) {
      int index_base = (ix) * ny;
      if ( index_base > index_max ) break;

      for(int jx=1; jx<ny-1; jx++) {
	int index = index_base + jx ;
	if ( index < index_min ) continue;
	if ( index > index_max ) break;
	index -= index_min; //convert to local index
	U[index] = (lambda / (lambda+1.0)) *
     	           sqrt(MIN(hy * MIN(jx, ny-jx-1), hx * MIN(ix,nx-ix-1)));
      }
    }
  } //initialize U

  //local part of Jacobian
  Matrix AAA(local_nrows, msize);
  
  { //fill matrix
    InitMatrix AA(local_nrows, msize);
    int index_min = pos * Manager::rank();
    int index_max = index_min + local_nrows - 1;
    
    for (int i=1; i<nx-1; i++) {
      int index_base = i * ny;
      if ( index_base > index_max ) break;
      
      for (int j=1; j<ny-1; j++) {
	int index = index_base + j;
	
	if ( index < index_min ) continue;
	if ( index > index_max ) break;

	int row   = index - index_min;
	
	AA(row, index) = 4.0 - lamhxhy * exp(U[row]);
	
	AA(row, index - ny) = -1.0;
	AA(row, index + ny) = -1.0;
	AA(row, index - 1)  = -1.0;
	AA(row, index + 1)  = -1.0;
      }
    }

    //j=0
    for (int i=0; i<nx; i++) {
      int index = i * ny;
      if ( index < index_min ) continue;
      if ( index > index_max ) break;
      int row = index - index_min;
      AA(row, index) = 1.0;
    }
    //j=ny-1;
    for (int i=0; i<nx; i++) {
      int index = i * ny + ny - 1;
      if ( index < index_min ) continue;
      if ( index > index_max ) break;
      int row = index - index_min;
      AA(row, index) = 1.0;
    }
    //i=0
    for (int j=0; j<ny; j++) {
      int index = j;
      if ( index < index_min ) continue;
      if ( index > index_max ) break;
      int row = index - index_min;
      AA(row, index) = 1.0;
    }
    //i=nx-1
    for (int j=0; j<ny; j++) {
      int index = (nx-1) * ny + j;
      if ( index < index_min ) continue;
      if ( index > index_max ) break;
      int row = index - index_min;
      AA(row, index) = 1.0;
    }

    mtl::copy(AA, AAA);
  }

  Vector F2(local_nrows);

  parallel_matrix<Matrix> A(AAA);

  apply_op<parallel_matrix<Matrix> > f(A, pos * Manager::rank(), nx, ny);

  f(U, F);
  double norm0 = itl::two_norm(F);
  double two_norm_F = norm0, old_two_norm_F = 0.0;

  if ( Manager::rank() == 0 ) {
    cout << "norm0 = " << norm0 << endl;
  }

  //classical_gram_schmidt<Vector> KS(restart, size(U));
  modified_gram_schmidt<Vector> KS(restart, size(U));
 
  // Start Newton solver
  int k;
  for (k = 0; k < kmax; k++) {

    //update Jacobian here. 

    /*Jacobian is close to -Laplacian except two cases:
     * 1. diagonal part:   4.0 - lamhxhy*exp(U[i])
     * 2. rows corresponding to boundary points [ 0 ... 1 ... 0 ]
     */
    {
      int index_min = pos*Manager::rank();
      int i, j;

      for (j=local_ny_begin; j<local_ny_end; ++j) {
	int index_base = j * nx;
	if (j == 0 || j == ny-1)
	  continue;
	
	for (i=local_nx_begin; i<local_nx_end; ++i) {
	  
	  int index = index_base + i;
	  int row = index - index_min;
	  
	  if (i == 0 || i == nx-1)
	    continue;

	  AAA(row, index) = 4.0 - lamhxhy*exp(U[row]);
	}
      }
    }

    double r = two_norm_F;
    identity_preconditioner p;
    block_ilu<Matrix> bjacobi;
    if ( is_precond ) {
      bjacobi(AAA, Manager::rank()*pos);
      solve(bjacobi(), F, F2);
      r = itl::two_norm(F2);
    }

    noisy_iteration<double> iter(r, iter_max, ksp_rtol, ksp_atol);
    basic_iteration<double>& bit=dynamic_cast<basic_iteration<double>&>(iter);

    //  Solve J dx = f  with restarted GMRES (identity precond)

    //initialize DX
    for (int i=0; i<DX.size(); i++)
      DX[i] = 0.0;

    if (ksp_type == "gmres") {
      if ( is_precond ) { 
	if ( ksp_monitor )
	  gmres(A, DX, F, bjacobi(), restart, iter, KS);
	else
	  gmres(A, DX, F, bjacobi(), restart, bit, KS);
	
      } else {
	if ( ksp_monitor )
	  gmres(A, DX, F, p(), restart, iter, KS);
	else
	  gmres(A, DX, F, p(), restart, bit, KS);
      }      
    } else if ( ksp_type == "bicgstab" ) {
      if ( is_precond ) { 
	if ( ksp_monitor )
	  bicgstab(A, DX, F, bjacobi(), iter);
	else
	  bicgstab(A, DX, F, bjacobi(), bit);
	
      } else {
	if ( ksp_monitor )
	  bicgstab(A, DX, F, p(), iter);
	else
	  bicgstab(A, DX, F, p(), bit);
      }      
    } else if ( ksp_type == "cgs" ) {
      if ( is_precond ) { 
	if ( ksp_monitor )
	  cgs(A, DX, F, bjacobi(), iter);
	else
	  cgs(A, DX, F, bjacobi(), bit);
	
      } else {
	if ( ksp_monitor )
	  cgs(A, DX, F, p(), iter);
	else
	  cgs(A, DX, F, p(), bit);
      }      
    } else if ( ksp_type == "tfqmr" ) {
      if ( is_precond ) { 
	if ( ksp_monitor )
	  tfqmr(A, DX, F, bjacobi.left(), bjacobi.right(), iter);
	else
	  tfqmr(A, DX, F, bjacobi.left(), bjacobi.right(), bit);
	
      } else {
	if ( ksp_monitor )
	  tfqmr(A, DX, F, p(), p(), iter);
	else
	  tfqmr(A, DX, F, p(), p(), bit);
      }      
    } else {
      std::cerr << "No suitable iterative method." << std::endl;
      return -1;
    }

    itl::add(itl::scaled(DX, -1.0), U);
    //  Compute F
    f(U, F);

    old_two_norm_F = two_norm_F;
    two_norm_F     = itl::two_norm(F);

    if ( Manager::rank() == 0 ) {
      cout << "Number of iteration used in GMRES: " << bit.iterations() 
	   << "     Residual: " << bit.resid() << endl;

      cout << "Newton iteration:                  " << k 
	   << "     |F| = " << two_norm_F << endl;
    }

    if ( two_norm_F < newton_rtol * norm0 
	 || std::abs( old_two_norm_F - two_norm_F ) < newton_stol
	 || two_norm_F < newton_atol )
      break;
  }

  if ( Manager::rank() == 0 )
    cout << "Number of Newton iteration: " << k+1 << endl;

  A.cleanup(); //to free MPI request and MPI datatype

  Manager::finish();
  return 0;
}
