!==============================================================================
!
! Routines:
!
! (1) epsilon           Originally by (MSH)      Last Edited 5/1/2008 (JRD)
!
!     Send comments/bugs to jdeslip@berkeley.edu
!
!     See README file for more details
!
!==============================================================================

#include "f_defs.h"

program epsilon

  use global_m
  use blas_m
  use chi_summation_m
  use chi_convergence_m
  use fftw_m
  use fullbz_m
  use genwf_eps_m
  use gmap_m
  use input_m
  use input_q_m
  use input_utils_m
  use irrbz_m
  use mtxel_m
  use mtxelmultiply_m
  use read_matrix_m
  use scalapack_m
  use sort_m
  use vcoul_generator_m
  use write_matrix_m
  use io_utils_m
  use epswrite_hdf5_m

#ifdef HDF5
  use hdf5
#endif
  
  implicit none

  type (kpoints) :: kp
  type (kpoints) :: kpq
  type (symmetry) :: syms
  type (gspace) :: gvec
  type (crystal) :: crys
  type (polarizability) :: pol
  type (valence_wfns) :: vwfn
  type (conduction_wfns) :: cwfn
  type (scalapack) :: scal
  type (int_wavefunction) :: intwfnv
  type (int_wavefunction) :: intwfnvq
  type (int_wavefunction) :: intwfnc
  type(chi_summator_t) :: chi_summator
  type(chi_converger_t) :: chi_converger
  type(wfn_FFT_comm_t) :: wfn_FFT_comm

!-----------------------
! Arrays for kpoints (fullbz, ...etc)

  integer :: nrk
  integer :: indst(48)
  integer, allocatable :: indrk(:),neq(:)
  type(grid) :: gr

!-----------------------
! Arrays for polarizability matrix

  integer :: nstar
  logical :: is_q0, use_WFNq, qpt_done
  integer :: npcmax,nprmax,ivin,neqmax
  integer, allocatable :: ind(:),indt(:,:,:)
  integer, allocatable :: nprdtemp(:),npcdtemp(:)
  integer, allocatable :: imyrowdtemp(:,:),imycoldtemp(:,:)
  real(DP) :: qq(3) !< The current q-pt under consideration
  real(DP) :: omega_plasma
  real(DP), allocatable :: ekin(:)
  SCALAR, allocatable :: ph(:)
  SCALAR, allocatable :: pht(:,:,:)
  integer, allocatable :: nst(:)

!-----------------------
! Variables used with HDF5

#ifdef HDF5
  integer :: hdf5_error
  integer :: my_iq
  integer, allocatable :: isorti(:)
  character(len=13) :: filename_chi_hdf5, filename_eps_hdf5, filename_out_hdf5
#endif

  character :: tmpstr*100
  character :: filename*13
  integer :: initial_access = 0
  integer :: i,j,k,n,irk,iv,itran,it
  integer :: ncount,ix,jx,kgq(3)
  integer :: np,iunit,iq,icurr
  integer :: ik, ij
  integer :: nmtx_t
  real(DP) :: tsec(2)
  real(DP) :: fact,rk(3)
  integer :: ispin,icol,irow,icolm,irowm
  character*24 :: routnam(120)
  integer :: routsrt(50)
  integer :: iunit_v, iunit_c
  real(DP) :: dnpcr,dnpcrmax

  integer :: jj

  type(progress_info) :: prog_info !< a user-friendly progress report

!--------------- Begin Program ---------------------------------------

  call peinfo_init()
  
!----------------------
! Initialize random numbers

  peinf%jobtypeeval = 0

!--------------------
! Initialize timer

  if(peinf%inode .eq. 0) then
    call timacc(0,0)
    call timacc(1,1)
  endif

!------------------------
! Initialize files

  if(peinf%inode .eq. 0) call timacc(19,1)
  
  call open_file(55,file='epsilon.inp',form='formatted',status='old')
  if(peinf%inode .eq. 0) then
    call open_file(7,file='epsilon.log',form='formatted',status='replace')
  endif
  
  call write_program_header('Epsilon', .true.)

!----------- Call Input: Read crystal data from unit 25 ---------------

! read parameters and q-points from unit 55 (input file)
! initialize unit 10 (output for polarizability matrix)

  if(peinf%inode .eq. 0) call timacc(19,2)
  
  if(peinf%inode .eq. 0) call timacc(2,1)
  call input(kp,crys,syms,gvec,pol,cwfn,vwfn,intwfnv,intwfnc,omega_plasma,gr)
  pol%FFTgrid = gvec%FFTgrid
  if (pol%min_fftgrid) then
    ! FHJ: Figure our the FFT grid that holds the WFNs
    call get_wfn_fftgrid(pol, gvec, kp, intwfnv)
  endif

  if(.not. pol%skip_chi .and. peinf%inode == 0) then
    call open_file(17,file='chi_converge.dat',form='formatted',status='replace')
  endif

#ifdef CPLX
  if(peinf%inode == 0 .and. pol%freq_dep == 2) &
    call open_file(83,file='polchiA.log',form='formatted',status='replace')
#endif

  if (pol%iwritecoul .eq. 1) then
    if (peinf%inode .eq. 0) then
      call open_file(19,file='vcoul',form='formatted',status='replace')
    endif
  endif

! CHP: saves the (G=0,G`=0) component of (retarded) epsilon inverse
  if(peinf%inode .eq. 0 .and. pol%freq_dep .eq. 2) then
    call open_file(51,file='EpsInvDyn',form='formatted',status='replace')
    call open_file(52,file='EpsDyn',form='formatted',status='replace')
  endif

  SAFE_ALLOCATE(vwfn%isort, (gvec%ng))
  SAFE_ALLOCATE(cwfn%isort, (gvec%ng))

  if (pol%nq0>0) then
    ! FHJ: no q->0 point can have all coordinates set to zero
    if (pol%icutv==0.and.any(all(abs(pol%qpt(:,:pol%nq0))<TOL_ZERO,dim=1))) then
      call die('No truncation and zero q0', only_root_writes=.true.)
    endif
  endif
  
  if(peinf%inode .eq. 0) call timacc(2,2)


!-------------- Read wavefunctions for (k+q) points ---------------------

! SIB:  The input_q routine looks the same as the input routine but
! if reads from a file called WFNq instead of WFN.  Presumably
! these are the shifted (by "q") wave functions.

  if(peinf%inode .eq. 0) call timacc(3,1)
  if (pol%need_WFNq) then
    if (peinf%inode .eq. 0) then
      write(6,*) 'You have a slightly shifted q0 vector and a semiconductor.'
      write(6,*) 'So reading from WFNq.'
    endif
    call input_q(gvec,kpq,cwfn,vwfn,pol,intwfnvq)
  elseif (pol%lin_denominator>TOL_Zero) then
  endif
  if(peinf%inode .eq. 0) call timacc(3,2)


!-------------- GENERATE FULL BZ ----------------------------------------

! SIB:  fullbz() takes the kpoints in kp%components(1:3,kp%nrk) and applies all
! the symmetries in syms to them.  The resulting set of unique vectors
! are in gr%f(1:3,gr%nf) (gr%nf of them).

  if (peinf%inode .eq. 0) then
#ifdef VERBOSE
    write(6,*)
    write(6,*) 'Calling fullbz to generate full zone from'
    write(6,*) 'syms%ntran = ',syms%ntran
    write(6,*) 'kp%nrk = ',kp%nrk
    do iq=1,kp%nrk
      write(6,'(3f10.4)') kp%rk(:,iq)
    enddo
    write(6,*)
#endif
    write(6,*) 'nfk=',gr%nf
#ifdef VERBOSE
    do iq=1,gr%nf
      write(6,'(3f10.4)') gr%f(:,iq)
    enddo
#endif
  endif

  SAFE_ALLOCATE(ekin, (gvec%ng))  
  SAFE_ALLOCATE(scal%nprd, (peinf%npes_freqgrp))
  SAFE_ALLOCATE(scal%npcd, (peinf%npes_freqgrp))

  if (pol%os_opt_ffts==2) then
    ! FHJ: FFTs opt. level 2 => do all the FFTs using all the processor, int_wfn arrays
    call genwf_FFT_Isend(wfn_FFT_comm,crys,gvec,syms,kp,kpq,vwfn,pol,cwfn,intwfnv,intwfnvq,intwfnc)
    !call genwf_FFT(crys,gvec,syms,kp,kpq,vwfn,pol,cwfn,intwfnv,intwfnvq,intwfnc,need_WFNq)
  endif

!----------- LOOP over q points for which chi and eps are calculated -----
  do iq=1,pol%nq

    if (pol%stop_after_qpt>-1 .and. iq>pol%stop_after_qpt) then
      if (peinf%inode==0) write(6,'(/,1x,a,/)') 'stop_after_qpt: emulating a sudden application stop.'
      FLUSH(6)
      FLUSH(0)
#ifdef MPI
      call MPI_Barrier(MPI_COMM_WORLD, mpierr)
      call MPI_Finalize(mpierr)
#endif
      MYSLEEP(1)
      stop
    endif

    ! SIB: qq(1:3) is the current q vector under consideration
    qq(:)=pol%qpt(:,iq)
    if(peinf%inode.eq.0) then
      call print_dealing_with(iq, pol%nq, qq, 'q')
    endif
    is_q0 = iq<=pol%nq0
    use_WFNq = (is_q0.and.pol%need_WFNq).or.pol%patched_sampling
    if (peinf%inode==0) then
      if (is_q0) then
        write(6,'(1x,a)') 'This is the special q->0 point.'
      else
        write(6,'(1x,a)') 'This is a regular non-zero q-point.'
      endif
    endif

#ifdef HDF5
    my_iq = iq
    if (.not.is_q0) my_iq = iq - pol%nq0
    if (is_q0) then
      filename_eps_hdf5 = 'eps0mat.h5'
      filename_chi_hdf5 = 'chi0mat.h5'
    else
      filename_eps_hdf5 = 'epsmat.h5'
      filename_chi_hdf5 = 'chimat.h5'
    endif
    if (pol%skip_epsilon) then
      filename_out_hdf5 = filename_chi_hdf5 ! Write to/restart chimat file
    else
      filename_out_hdf5 = filename_eps_hdf5 ! Write to/restart epsmat file
    endif
    if (pol%use_hdf5.and.pol%restart) then
      if (peinf%inode==0) qpt_done = is_qpt_done(TRUNC(filename_out_hdf5), my_iq)
#ifdef MPI
      call MPI_BCAST(qpt_done, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, mpierr)
#endif
      if (qpt_done) then
        if (peinf%inode==0) then
          write(6,'(/,1x,a,/)') 'This q-point was already calculated: skipping.'
        endif
        cycle
      endif
    endif
#endif

    if(peinf%inode.eq.0) call timacc(20,1)

!--------------------
! Determine number of matrix elements
!
! Calculate kinetic energies |q+G|^2

    if (is_q0) then
      call kinetic_energies(gvec, crys%bdot, ekin)
    else
      call kinetic_energies(gvec, crys%bdot, ekin, qvec = qq)
    endif

!--------------------
! Sort kinetic energies
! index of ordered kinetic energies in array isrtx
!
! SIB: pol%isrtx has the indices for sorted ekin

    SAFE_ALLOCATE(pol%isrtx, (gvec%ng))

    if(peinf%inode.eq.0) call timacc(20,2)
    
    if(peinf%inode.eq.0) call timacc(5,1)
#ifdef VERBOSE
    call logit('sorting gvec')
#endif
    call sortrx(gvec%ng, ekin, pol%isrtx, gvec = gvec%components)
    if(peinf%inode.eq.0) call timacc(5,2)

!---------------------
! Compute inverse array to isrtx

    SAFE_ALLOCATE(pol%isrtxi, (gvec%ng))
    do i=1,gvec%ng
      pol%isrtxi(pol%isrtx(i))=i
    enddo

! SIB:  pol%nmtx becomes the number of matrix elements to be computed;
! the matrix is computed if its ekin is < ecuts

    if(peinf%inode.eq.0) call timacc(21,1)

    pol%nmtx = gcutoff(gvec%ng, ekin, pol%isrtx, pol%ecuts)
    if(peinf%inode.eq.0) then
      write(6, '(1x,a,i0)') 'Rank of the polarizability matrix (nmtx): ', pol%nmtx
    endif
    ! FHJ: Do we want to use the economical fftgrid/box? If so, we pad the WFN FFTbox
    ! by the box that holds nmtx gvectors, which is much smaller than the WFN fftbox.
    if (pol%min_fftgrid.and.pol%os_opt_ffts<2) call get_eps_fftgrid(pol, gvec)
    
    if(peinf%inode.eq.0) call timacc(21,2)
    if(peinf%inode.eq.0) call timacc(22,1)

    SAFE_ALLOCATE(pol%irow, (pol%nmtx))
    pol%irow=0

! JRD:  Determine size of distributed matrices


    if(pol%os_para_freqs .gt. 1) then
    else
      call blacs_setup(scal, pol%nmtx, .true.)
    endif
#ifdef USESCALAPACK
#ifdef VERBOSE
    call logit('Initializing scaLAPACK')
    if (peinf%inode .eq. 0) then
      write(6,*) ' '
    endif
#endif
#endif

#ifdef MPI
    SAFE_ALLOCATE(nprdtemp, (peinf%npes_freqgrp))
    SAFE_ALLOCATE(npcdtemp, (peinf%npes_freqgrp))
    
    scal%nprd = 0
    scal%npcd = 0
    nprdtemp = 0
    npcdtemp = 0

    if(pol%os_para_freqs .gt. 1) then
    else
      nprdtemp(peinf%inode + 1) = scal%npr
      npcdtemp(peinf%inode + 1) = scal%npc
      call MPI_ALLREDUCE(nprdtemp,scal%nprd,peinf%npes_freqgrp,MPI_INTEGER,MPI_SUM, MPI_COMM_WORLD,mpierr)
      call MPI_ALLREDUCE(npcdtemp,scal%npcd,peinf%npes_freqgrp,MPI_INTEGER,MPI_SUM, MPI_COMM_WORLD,mpierr)
    endif
    SAFE_DEALLOCATE(nprdtemp)
    SAFE_DEALLOCATE(npcdtemp)
#else
    scal%nprd = scal%npr
    scal%npcd = scal%npc
#endif

! DWV: Get the maximum number of columns/rows owned by one of the processors. This is so you can
! allocate arrays of the right size. For what determines how many rows and columns a procesor
! has, see blacs_setup routine in Common directory and google the numroc routine of scalapack
! Numroc is a nifty little routine

    dnpcr = scal%npc*1D0*scal%npr

#ifdef MPI
    call MPI_ALLREDUCE(scal%npc,npcmax,1,MPI_INTEGER,MPI_MAX,MPI_COMM_WORLD,mpierr)
    call MPI_ALLREDUCE(scal%npr,nprmax,1,MPI_INTEGER,MPI_MAX,MPI_COMM_WORLD,mpierr)
    call MPI_ALLREDUCE(dnpcr,dnpcrmax,1,MPI_DOUBLE_PRECISION,MPI_MAX,MPI_COMM_WORLD,mpierr)
#else
    npcmax = scal%npc
    nprmax = scal%npr
#endif

    if (dnpcr .gt. (pol%nmtx*1.5D0*pol%nmtx/(1D0*peinf%npes))) then
      if (peinf%inode .eq. 0) write(0,'(a)') ' WARNING: ScaLAPACK layout is not well load-balanced.'
      if (peinf%inode .eq. 0) write(0,'(a,f12.0)') ' Max number of elements owned by one task is:', dnpcr
    endif

! Allocate scalapack arrays needed later. See scalapack in Common/scalapack.f90 to see what
! these arrays hold

    SAFE_ALLOCATE(scal%isrtxcol, (scal%npc))
    SAFE_ALLOCATE(scal%isrtxrow, (scal%npr))
    SAFE_ALLOCATE(scal%imycol, (scal%npc))
    SAFE_ALLOCATE(scal%imyrow, (scal%npr))
    SAFE_ALLOCATE(scal%imycolinv, (pol%nmtx))
    SAFE_ALLOCATE(scal%imyrowinv, (pol%nmtx))
    SAFE_ALLOCATE(scal%imycold, (npcmax,peinf%npes))
    SAFE_ALLOCATE(scal%imyrowd, (nprmax,peinf%npes))
    SAFE_ALLOCATE(imycoldtemp, (npcmax,peinf%npes))
    SAFE_ALLOCATE(imyrowdtemp, (nprmax,peinf%npes))
    scal%imycold = 0
    scal%imyrowd = 0
    imycoldtemp = 0
    imyrowdtemp = 0

    icurr=0
    
    do ij = 1, pol%nmtx
      irow=MOD(INT(((ij-1)/scal%nbl)+TOL_SMALL),scal%nprow)
      if (irow .eq. scal%myprow) then
        do ik = 1, pol%nmtx
          icol=MOD(INT(((ik-1)/scal%nbl)+TOL_SMALL),scal%npcol)
          if(icol .eq. scal%mypcol) then
            icurr=icurr+1
            irowm=INT((icurr-1)/scal%npc+TOL_SMALL)+1
            icolm=MOD((icurr-1),scal%npc)+1
            scal%isrtxrow(irowm)=pol%isrtx(ij)
            scal%isrtxcol(icolm)=pol%isrtx(ik)
            scal%imyrow(irowm)=ij
            scal%imycol(icolm)=ik
            scal%imyrowinv(ij)=irowm
            scal%imycolinv(ik)=icolm
            imyrowdtemp(irowm,peinf%inode+1)=ij
            imycoldtemp(icolm,peinf%inode+1)=ik
          endif
        enddo
      endif
    enddo

#ifdef MPI
    call MPI_ALLREDUCE(imyrowdtemp(1,1),scal%imyrowd(1,1),nprmax*peinf%npes,MPI_INTEGER,MPI_SUM,MPI_COMM_WORLD,mpierr)
    call MPI_ALLREDUCE(imycoldtemp(1,1),scal%imycold(1,1),npcmax*peinf%npes,MPI_INTEGER,MPI_SUM,MPI_COMM_WORLD,mpierr)
#else
    scal%imyrowd = imyrowdtemp
    scal%imycold = imycoldtemp
#endif

    SAFE_DEALLOCATE(imycoldtemp)
    SAFE_DEALLOCATE(imyrowdtemp)

    if(peinf%inode.eq.0) call timacc(22,2)

    if(peinf%inode.eq.0) call timacc(23,1)

!----------------------
! Determine subgroup which leaves qq invariant
!
! SIB:  figures out which symmetries acting on qq result in qq + integer
! entries.  syms%ntranq is their number, syms%indsub are their indices
! (pointing to syms%mtrx), and syms%kgzero(1:3,:) are the integer entries.

    if(peinf%inode.eq.0) call timacc(6,1)
    call subgrp(qq,syms)
    if (pol%patched_sampling) then
      syms%ntranq = 1
    endif
    if(peinf%inode.eq.0) call timacc(6,2)

!-----------------------
! Determine independent elements of polarizability matrix
!
! SIB:  independent means due to symmetries.  This initializes
! the polarizability matrix pol%chi to zero (for independent entries)
! and figure out phases due to symmetries for dependent ones,
! and points dependent ones to the entries they depend on (pol%kxi indices)

! JRD: we don`t do this anymore

!        if(peinf%inode.eq.0) call timacc(7,1)
!#ifdef VERBOSE
!        call logit('calling indep')
!#endif
!        call indep(nind,gvec,syms,pol,kp%nspin)
!
! JRD: Testing what if we set pol%kxi to zero
!        pol%kxi = 0
!        pol%chi = 0D0
!        nind=pol%nmtx*(pol%nmtx+1)/2
!
!        if(peinf%inode.eq.0) call timacc(7,2)

!----------------------
! Reduce the k-points to the irr. bz with respect to q
!
! SIB:  figure out k-points in irr. BZ (based on symmetries for current q)
! nrk is # of irr. points, indrk are their indices in the full zone,
! and neq is the number of equiv. points for an irr. point.
! (Full zone vectors come in through gr%f(1:3,1:gr%nf).)

    if(peinf%inode.eq.0) call timacc(8,1)
    SAFE_ALLOCATE(indrk, (gr%nf))
    SAFE_ALLOCATE(neq, (gr%nf))
    call irrbz(syms,gr%nf,gr%f,nrk,neq,indrk)
    if(peinf%inode.eq.0) call timacc(8,2)

    neqmax = maxval(neq(1:nrk))

!        write(6,*) peinf%inode, 'neqmax', neq(1), neqmax

!---------------------------
! Output points in irr. bz

    if(peinf%inode.eq.0) then
      write(6,'(1x,a,i0)') 'Number of k-points in the irreducible BZ(q) (nrk): ', nrk
#ifdef VERBOSE
      do j=1,nrk
        write(6,60) (gr%f(i,indrk(j)),i=1,3),neq(j)
60      format(3f10.5,i5)
      enddo
#endif
      write(7,70) (qq(i),i=1,3),pol%nmtx,nrk
70    format(/ /,5x,'q=',3f7.4,2x,'nmtx=',i8,2x,'nrk=',i3)
    endif

    if (pol%patched_sampling) then
      fact=4.0d0/(product(kp%kgrid)*crys%celvol*kp%nspin*kp%nspinor)
    else
      fact=4.0d0/(dble(gr%nf)*crys%celvol*kp%nspin*kp%nspinor)
    endif

    if (pol%freq_dep .eq. 0) then
      SAFE_ALLOCATE(pol%chi, (scal%npr,scal%npc,kp%nspin))
      pol%chi=ZERO
    endif

    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0)) then
      SAFE_ALLOCATE(pol%chiRDyn, (pol%os_nfreq_para,scal%npr,scal%npc,kp%nspin))
      pol%chiRDyn=(0.0,0.0)
#ifdef CPLX
      SAFE_ALLOCATE(pol%chiADyn, (pol%os_nfreq_para,scal%npr,scal%npc,kp%nspin))
      pol%chiADyn=(0.0,0.0)
#endif
    endif

    if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1)) then
      SAFE_ALLOCATE(pol%chiRDyn, (pol%os_nfreq_para,scal%npr,scal%npc,kp%nspin))
      pol%chiRDyn=(0.0,0.0)
#ifdef CPLX
      SAFE_ALLOCATE(pol%chiADyn, (pol%os_nfreq_para,scal%npr,scal%npc,kp%nspin))
      pol%chiADyn=(0.0,0.0)
#endif
      SAFE_ALLOCATE(pol%chiTDyn, (pol%os_nsfreq_para,scal%npr,scal%npc,kp%nspin))
      pol%chiTDyn=(0.0,0.0)
    endif

    if (pol%freq_dep .eq. 3) then
      SAFE_ALLOCATE(pol%chiRDyn, (pol%os_nfreq_para,scal%npr,scal%npc,kp%nspin))
      pol%chiRDyn=(0.0,0.0)
    endif

    if (.not. pol%skip_chi) then

!------------------------------------
! SIB:  allocate space

!        write(6,*) peinf%inode, 'allocating pht', neqmax, pol%nmtx, nrk

      SAFE_ALLOCATE(ind, (pol%nmtx))
      SAFE_ALLOCATE(ph, (pol%nmtx))
      SAFE_ALLOCATE(pht, (pol%nmtx,neqmax,nrk))
      SAFE_ALLOCATE(indt, (pol%nmtx,neqmax,nrk))
      ind=0

      SAFE_ALLOCATE(nst, (nrk))

! JRD: Possible Memory Hazard.  We can speed this up by possibly
! only allocating number of bands on current proc and doing send/recvs
      if(pol%os_para_freqs .gt. 1) then      
      else
        SAFE_ALLOCATE(pol%gme, (pol%nmtx,peinf%ncownactual,peinf%nvownactual,kp%nspin,nrk,pol%os_para_freqs))
      endif
    
      if (pol%freq_dep .eq. 2 .or. pol%freq_dep .eq. 3) then
        if(pol%os_para_freqs .eq. 1) then      
          SAFE_ALLOCATE(pol%edenDyn, (peinf%nvownactual,peinf%ncownactual,kp%nspin,nrk,pol%os_para_freqs))
        else
        endif
      endif

      if(peinf%inode.eq.0) call timacc(23,2)

!--------- LOOP OVER K-POINTS IN SET RK ---------------------------------
      ! FHJ: this is to generate nice output / time estimate
      call progress_init(prog_info, 'calculation of matrix elements', 'transition', nrk)

! SIB:  loop over points in irreducible zone
      do irk=1,nrk
        ! FHJ : friendly output / running time estimate
        call progress_step(prog_info, irk)
        
        if (pol%freq_dep .eq. 0) then
          SAFE_ALLOCATE(pol%eden, (vwfn%nband+pol%ncrit,cwfn%nband,kp%nspin))
          
        endif
        
        rk(:)=gr%f(:,indrk(irk)) ! rk(:) is the current irr. k-point

! Regenerate star of rk,store the index of the rotation
! SIB:  Star is the set of vectors generated by applying all
! subgroup operations for the current q-vector to the k-point rk.

        if(peinf%inode.eq.0) call timacc(11,1)
        call rqstar(syms,nstar,indst,rk)
        if(nstar.ne.neq(irk)) then
          write(0,*) 'nstar?',irk,nstar,neq(irk)
          call die('nstar mismatch')
        endif
        if(peinf%inode.eq.0) call timacc(11,2)

! JRD: loop over transfs which generate the star of rk for gmap

        nst(irk) = nstar
        if(peinf%inode.eq.0) call timacc(12,1)
        do it=1,nstar

! Map g-vectors in polarizability to r**(-1)(g-gq)
! note that gmap requires index of transf in full group
! whereas indst gives index in subgroup

          itran = syms%indsub(indst(it))
          kgq(:) = -syms%kgzero(:,indst(it)) ! note minus sign
          call gmap(gvec,syms,pol%nmtx,itran,kgq,pol%isrtx,pol%isrtxi,ind,ph,.true.)

          pht(:,it,irk) = ph(:)
          indt(:,it,irk) = ind(:)

! debug Statement here
        enddo
        if(peinf%inode.eq.0) call timacc(12,2)


!--------- loop over occupied states -------------------------------------

! SIB:  loop over valence states (iv,ispin) where iv is the band index.

        if (pol%os_opt_ffts==2) then
          if (.not.wfn_FFT_comm%done) call genwf_FFT_Wait(wfn_FFT_comm)
          !FHJ: TODO free me later!
          call genwf_lvl2(kp,kpq,vwfn,pol,cwfn)
        endif
        pol%gme(:,:,:,:,irk,:)=0D0
        do iv=1,peinf%nvownmax
#ifdef VERBOSE
          if (peinf%inode .eq. 0) write(6,*) 'Doing iv', iv,'of', peinf%nvownmax
#endif

          if (pol%os_opt_ffts/=2) then
            call genwf_gen(syms,gvec,crys,kp,kpq,irk,rk,qq,vwfn,pol,cwfn,use_WFNq,intwfnv,intwfnvq,intwfnc,iv)

            if (pol%os_opt_ffts==1) then
              ! FHJ: FFT opt. level 1: precalculates FFTs of the conduction bands
              !      each kpt at a time.
              if (iv==1) then
                call mtxel_init_FFT_cond(gvec,pol,cwfn,kp)
              endif
            endif
          endif

! SIB:  compute matrix elements and energy denominators for (iv,ispin)
! with all other conduction bands.

          do ispin=1,kp%nspin

#ifdef VERBOSE
            write(tmpstr,'(a,i2,a,i4,a)') "is =", ispin, " iv = ", iv, " calling mtxel" 
            call logit(tmpstr)
#endif

            if ( iv .le. peinf%nvownactual) then
              if(peinf%inode.eq.0) call timacc(10,1)
              ivin=peinf%invindexv(iv)

              call mtxel(ivin,gvec,vwfn,cwfn,pol,ispin,irk,kp,kpq,peinf%rank_mtxel)

              if(peinf%inode.eq.0) call timacc(10,2)
            endif

          enddo ! ispin

          if (pol%os_opt_ffts<2) then
            ! FHJ: opt. lvl 2 doesn`t even own the WFNs..
            if ( iv .le. peinf%nvownactual) then
              SAFE_DEALLOCATE_P(vwfn%zv)
            endif
          endif

        enddo ! iv

        if (peinf%nvownactual>0) then
          if (pol%os_opt_ffts<2) then
            SAFE_DEALLOCATE_P(cwfn%zc)
          endif
          if (pol%os_opt_ffts==1) then
            ! FHJ: destroy FFTs of conduction bands
            call mtxel_free_FFT_cond(cwfn)
          endif

          SAFE_DEALLOCATE_P(vwfn%ev)
          SAFE_DEALLOCATE_P(cwfn%ec)
        endif

        if (pol%freq_dep .eq. 0) then
          SAFE_DEALLOCATE_P(pol%eden)
        endif
        
      enddo ! irk
      call progress_free(prog_info)

!------------------------------------------------------------------
! DWV: if requested, test convergence of chi with conduction bands

      if (peinf%inode .eq. 0 .and. pol%freq_dep .eq. 0 .and. pol%fullConvLog .ne. -1) then
        write(6,*) 'Allocating Convergence Arrays of Dim', cwfn%nband-vwfn%nband
      endif

      if(peinf%inode.eq.0) call timacc(24,1)

      if (pol%freq_dep .eq. 0 .and. pol%fullConvLog .ne. -1) then

        call create_chi_converger(chi_converger,vwfn%nband,cwfn%nband)

#ifdef VERBOSE
        if (peinf%inode .eq. 0) write(6,'(/,1x,"Starting Convergence Tests")')
#endif
        call chi_convergence_test(pol,pht,indt,kp,nrk,nst,vwfn%nband,cwfn%nband,fact,chi_converger)
 
        if(peinf%inode .eq. 0) then
          call chi_convergence_print(pol,iq,vwfn%nband,cwfn%nband,chi_converger)
        endif

        call free_chi_converger(chi_converger)

!        write(6,*) 'End Convergence Writing'

      endif ! pol%freq_dep .eq. 0

      if(peinf%inode.eq.0) call timacc(24,2)


!-----------------------------------------------------------------------
! Construct part of chi that this proc owns by summing the pol%gme matrices

#ifdef VERBOSE
      if (peinf%inode .eq. 0) write(6,'(/,1x,"Doing chi Summation")')
#endif
      if (peinf%inode.eq.0) call timacc(15,1)

      call create_chi_summator(chi_summator, pol, fact)

      if (pol%gcomm .eq. 0) then
        call chi_summation_comm_elements(chi_summator,&
                                     pol,scal,kp,vwfn,cwfn,&
                                     nst,nrk,indt,pht)
      else
        call chi_summation_comm_matrix(chi_summator,&
                                       pol,scal,kp,kpq,vwfn,&
                                       nst,nrk,indt,pht)
      endif

      call free_chi_summator(chi_summator, pol)
          
      if(peinf%inode.eq.0) call timacc(15,2)
#ifdef VERBOSE 
      if (peinf%inode .eq. 0) write(6,'(1x,a)') "Done Polarizability"
#endif


! Done ChiSum
!-----------------------------------------------------------------------
! Deallocate some arrays no longer needed

      SAFE_DEALLOCATE(pht)
      SAFE_DEALLOCATE(indt)
      SAFE_DEALLOCATE(ind)
      SAFE_DEALLOCATE(ph)
      SAFE_DEALLOCATE(nst)
      SAFE_DEALLOCATE_P(pol%gme)

      if (pol%freq_dep .eq. 2 .or. pol%freq_dep .eq. 3) then
        SAFE_DEALLOCATE_P(pol%edenDyn)
      endif

    else ! pol%skip_chi
!DWV: read chi from chi if this is specified

#ifdef HDF5
      if (pol%use_hdf5) then
        if (peinf%inode .eq. 0) then
          ! FHJ: FIXME: consistency checks...
        endif
        ! FHJ: TODO - write diagonal elements (I actually think this is useless)
        if (pol%freq_dep .eq. 0) then
          do ispin=1,kp%nspin
            call read_matrix_d_hdf5(scal, pol%chi(:,:,ispin), pol%nmtx, TRUNC(filename_chi_hdf5), my_iq, ispin)
          enddo
        else
          do ispin=1,kp%nspin
            call read_matrix_f_hdf5(scal, pol%nFreq, pol%os_nfreq_para, pol%chiRDyn(:,:,:,ispin), &
              pol%chiADyn(:,:,:,ispin), pol%nmtx, pol%os_para_freqs, TRUNC(filename_chi_hdf5), my_iq, ispin)
          enddo
        endif
      else
#endif

      if (is_q0) then
        
        iunit=10
        if(peinf%inode.eq.0) then
          write(6,'(1x,a)') 'Reading from file chi0mat.'
          call open_file(unit=10,file='chi0mat',form='unformatted',status='old')
          read(10)
          read(10)
          read(10)
          read(10)
          read(10)
          read(10)
          read(10)
          read(10)
          read(10)
          read(10)
          read(10) 
          read(10) nmtx_t
!                write(6,*) 'nmtx_t for chi0mat', nmtx_t
        endif
#ifdef MPI
        call mpi_barrier(MPI_COMM_WORLD,mpierr)
#endif
        do ispin=1,kp%nspin
          if (pol%freq_dep .eq. 0) then
            call read_matrix_d(scal,pol%chi(:,:,ispin),pol%nmtx,iunit)
          endif ! pol%freq_dep .eq. 0
          if (pol%freq_dep .eq. 2) then
            call read_matrix_f(scal,pol%nFreq,pol%os_nfreq_para,pol%chiRDyn(:,:,:,ispin), &
              pol%chiADyn(:,:,:,ispin),pol%nmtx,pol%os_para_freqs,iunit)
          endif ! pol%freq_dep .eq. 2!Note here!
        enddo ! ispin

      else ! is_q0

        iunit=11
        if(peinf%inode.eq.0) then
          write(6,'(1x,a)') 'Reading from file chimat.'
          if (initial_access .eq. 0) then
            call open_file(unit=11,file='chimat',form='unformatted',status='old')
            read(11)
            read(11)
            read(11)
            read(11)
            read(11)
            read(11)
            read(11)
            read(11)
            read(11)
            read(11)
          endif
          read(11)
          read(11) nmtx_t
!                write(6,*) 'nmtx_t for chimat', nmtx_t
        endif
#ifdef MPI
        call mpi_barrier(MPI_COMM_WORLD,mpierr)
#endif
        do ispin=1,kp%nspin
          if (pol%freq_dep .eq. 0) then
            call read_matrix_d(scal,pol%chi(:,:,ispin),pol%nmtx,iunit)
          endif ! pol%freq_dep .eq. 0
          if (pol%freq_dep .eq. 2) then
            call read_matrix_f(scal,pol%nFreq,pol%os_nfreq_para,pol%chiRDyn(:,:,:,ispin),&
                               pol%chiADyn(:,:,:,ispin),pol%nmtx,pol%os_para_freqs,iunit)
          endif ! pol%freq_dep .eq. 2!note here!

        enddo ! ispin
        initial_access = 1
      endif ! is_q0
#ifdef HDF5
      endif
#endif

    endif ! pol%skip_chi

!-----------------------------------------------------------------------
! JRD: Now write out elements that Proc 1 owns

    do ispin = 1, kp%nspin

      if (pol%freq_dep.eq.0 .and. peinf%inode.eq.0) then
        write(7,940) ispin,kp%nspin
        do i=1,scal%npr
          ix=scal%isrtxrow(i)
          do j=1,scal%npc

! JRD: Diagonal, subdiagonal and wings only

            jx=scal%isrtxcol(j)
            if(i.eq.j .or. (gvec%components(1,ix) .eq. 0 .and. gvec%components(2,ix) .eq. 0 .and. gvec%components(3,ix) .eq. 0)) &
              write(7,950) (gvec%components(k,ix),k=1,3),ekin(ix),(gvec%components(k,jx),k=1,3),ekin(jx), &
                            pol%chi(i,j,ispin)
          enddo
        enddo
      endif ! pol%freq_dep.eq.0 .and. peinf%inode.eq.0
      
      if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0) .and. peinf%inode.eq.0) then
        write(7,940) ispin,kp%nspin

        do jj=1,pol%os_nfreq_para
          write(7,*)'frq=',jj
          do i=1,scal%npr
            ix=scal%isrtxrow(i)
            do j=1,scal%npc
              
 !!!JRD: Diagonal and subdiagonal only
   
              jx=scal%isrtxcol(j)
              if(i.eq.j) &
                write(7,950) (gvec%components(k,ix),k=1,3),ekin(ix),(gvec%components(k,jx),k=1,3),ekin(jx), &
                              pol%chiRDyn(jj,i,j,ispin)
            enddo
          enddo
        enddo

#ifdef CPLX
        do jj=1,pol%os_nfreq_para
          write(83,*)'frq=',jj
          do i=1,scal%npr
            ix=scal%isrtxrow(i)
            do j=1,scal%npc

!JRD: Diagonal and subdiagonal only

              jx=scal%isrtxcol(j)
              if(i.eq.j) &
                write(83,950) (gvec%components(k,ix),k=1,3),ekin(ix),(gvec%components(k,jx),k=1,3),ekin(jx), &
                              pol%chiADyn(jj,i,j,ispin)
            enddo
          enddo
        enddo
#endif
!       write(7,940) ispin,kp%nspin
      endif ! (pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 0) .and. peinf%inode.eq.0
     
      if ((pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1).and. peinf%inode.eq.0) then
        write(7,940) ispin,kp%nspin

         do jj=1,pol%os_nfreq_para
           write(7,*) 'frq=',jj
           do i=1,scal%npr
            ix=scal%isrtxrow(i)
            do j=1,scal%npc
              jx=scal%isrtxcol(j)
              if(i.eq.j) &
                write(7,950) (gvec%components(k,ix),k=1,3),ekin(ix),(gvec%components(k,jx),k=1,3),ekin(jx), &
                              pol%chiRDyn(jj,i,j,ispin)
            enddo
          enddo
        enddo

#ifdef CPLX
        do jj=1,pol%os_nfreq_para
           write(83,*) 'frq=',jj
           do i=1,scal%npr
            ix=scal%isrtxrow(i)
            do j=1,scal%npc
              jx=scal%isrtxcol(j)
              if(i.eq.j) &
                write(83,950) (gvec%components(k,ix),k=1,3),ekin(ix),(gvec%components(k,jx),k=1,3),ekin(jx), &
                              pol%chiADyn(jj,i,j,ispin)
            enddo
          enddo
        enddo
#endif
      endif ! (pol%freq_dep .eq. 2).and.(pol%freq_dep_method .eq. 1) .and. peinf%inode.eq.0
 
940   format(/,10x,' independent matrix elements of chi', 7x,'spin index= ',1i1,1x,1i1,/,/,&
        10x,'g',10x,'g**2',10x,'gp',10x,'gp**2',10x,'chi(g,gp)')
! if last value is real, only one of the f13.8 will be used.
950   format(3i5,f10.5,5x,3i5,f10.5,5x,2f15.10)

    enddo ! ispin (loop over spins)

!        write(6,*) 'End Element Writing'


!--------- write polarizability matrix and crystal info to file ---------

    if (pol%skip_epsilon) then
      
#ifdef HDF5
      if (pol%use_hdf5) then
        if (peinf%inode .eq. 0) then
          SAFE_ALLOCATE(isorti, (gvec%ng))
          do i=1,gvec%ng
            isorti(pol%isrtx(i)) = i
          enddo
          call write_gvec_indices_hdf(gvec%ng,pol%isrtx,isorti,ekin,my_iq,TRUNC(filename_chi_hdf5))
          SAFE_DEALLOCATE(isorti)
        endif
        ! FHJ: TODO - write diagonal elements (I actually think this is useless)
        if (pol%freq_dep .eq. 0) then
          do ispin=1,kp%nspin
            ! FHJ: FIXME: We should unify these routines!
#ifdef USESCALAPACK
            call write_matrix_d_par_hdf(scal, pol%chi(:,:,ispin), pol%nmtx, my_iq, ispin, TRUNC(filename_chi_hdf5))
#else
            call write_matrix_d_hdf(scal, pol%chi(:,:,ispin), pol%nmtx, my_iq, ispin, TRUNC(filename_chi_hdf5))
#endif
          enddo
        else
          do ispin=1,kp%nspin
            ! FHJ: FIXME: We should unify these routines!
#ifdef USESCALAPACK
            call write_matrix_f_par_hdf(scal, pol%nFreq, pol%chiRDyn(:,:,:,ispin), &
#ifdef CPLX
              pol%chiADyn(:,:,:,ispin), &
#endif
              pol%nmtx, my_iq, ispin, TRUNC(filename_chi_hdf5))
#else
            call write_matrix_f_hdf(scal, pol%nFreq, pol%chiRDyn(:,:,:,ispin), &
#ifdef CPLX
              pol%chiADyn(:,:,:,ispin), &
#endif
              pol%nmtx, my_iq, ispin, TRUNC(filename_chi_hdf5))
#endif
          enddo
        endif
      else
#endif
        iunit=11
        if (is_q0) iunit=10
        if(peinf%inode.eq.0) then
          write(iunit) syms%ntranq,(((syms%mtrx(i,j,syms%indsub(n)),i=1,3),j=1,3), &
            (syms%tnp(k,syms%indsub(n)),syms%kgzero(k,n),k=1,3),n=1,syms%ntranq)
          np=pol%nmtx*(pol%nmtx+1)/2
          write(iunit) pol%nmtx,np,(pol%isrtx(i),ekin(i),i=1,gvec%ng),(pol%irow(i),i=1,pol%nmtx)
        endif
        do ispin=1,kp%nspin
          if (pol%freq_dep .eq. 0) then
            call write_matrix_d(scal,pol%chi(:,:,ispin),pol%nmtx,iunit)
          endif ! pol%freq_dep .eq. 0
          if (pol%freq_dep .eq. 2) then
            call write_matrix_f(scal,pol%nFreq,pol%chiRDyn(:,:,:,ispin), &
#ifdef CPLX
            pol%chiADyn(:,:,:,ispin),&
#endif
            pol%nmtx, iunit,pol%os_para_freqs)
          endif ! pol%freq_dep .eq. 2
        enddo ! ispin
#ifdef HDF5
      endif
#endif
    endif ! pol%skip_epsilon

! Use pol%chi(j,1) as sum over spin components
! JRD: Why was proc 0 the only one doing this??!!

    if (pol%freq_dep .eq. 0) then
      if(kp%nspin.eq.2) pol%chi(:,:,1)=pol%chi(:,:,1)+pol%chi(:,:,2)
    endif ! pol%freq_dep .eq. 0
    if (pol%freq_dep .eq. 2) then
      if(kp%nspin.eq.2) pol%chiRDyn(:,:,:,1)=pol%chiRDyn(:,:,:,1)+pol%chiRDyn(:,:,:,2)
#ifdef CPLX
      if(kp%nspin.eq.2) pol%chiADyn(:,:,:,1)=pol%chiADyn(:,:,:,1)+pol%chiADyn(:,:,:,2)
#endif
    endif ! pol%freq_dep .eq. 2

    if (peinf%inode .eq. 0) then
      call timacc(13,1)
#ifdef VERBOSE
      call logit('Calling epsinv')
#endif
    endif ! peinf%inode .eq. 0

    if (.not. pol%skip_epsilon) then
        call epsinv(gvec,pol,ekin,qq,is_q0,crys,scal,kp,omega_plasma,iq)

      if(peinf%inode.eq.0) then
#ifdef VERBOSE
        call logit('Finished epsinv')
#endif
        call timacc(13,2)
      endif ! peinf%inode.eq.0
      
    endif ! pol%skip_epsilon

    if (pol%freq_dep .eq. 0) then
      SAFE_DEALLOCATE_P(pol%chi)
    endif
    if (pol%freq_dep .eq. 2) then
      SAFE_DEALLOCATE_P(pol%chiRDyn)
#ifdef CPLX
      SAFE_DEALLOCATE_P(pol%chiADyn)
#endif
      if(pol%freq_dep_method .eq. 1) then
        SAFE_DEALLOCATE_P(pol%chiTDyn)
      endif
    endif
    if (pol%freq_dep .eq. 3) then
      SAFE_DEALLOCATE_P(pol%chiRDyn)
    endif


    SAFE_DEALLOCATE(indrk)
    SAFE_DEALLOCATE(neq)

    SAFE_DEALLOCATE_P(pol%isrtx)
    SAFE_DEALLOCATE_P(pol%isrtxi)
    SAFE_DEALLOCATE_P(pol%irow)
    SAFE_DEALLOCATE_P(scal%isrtxcol)
    SAFE_DEALLOCATE_P(scal%isrtxrow)
    SAFE_DEALLOCATE_P(scal%imycol)
    SAFE_DEALLOCATE_P(scal%imyrow)
    SAFE_DEALLOCATE_P(scal%imycolinv)
    SAFE_DEALLOCATE_P(scal%imyrowinv)
    SAFE_DEALLOCATE_P(scal%imycold)
    SAFE_DEALLOCATE_P(scal%imyrowd)

#ifdef HDF5
    if (pol%use_hdf5.and.peinf%inode==0) call set_qpt_done(TRUNC(filename_out_hdf5), my_iq)
#endif

  enddo ! iq (loop over q points)

  SAFE_DEALLOCATE_P(scal%nprd)
  SAFE_DEALLOCATE_P(scal%npcd)
  call dealloc_grid(gr)

! End q point loop!
!-------------------------------------------------------------------


!----------- Clean House -------------------------------------------

#ifdef VERBOSE
  call logit('Cleaning up')
#endif

  if (.not. pol%skip_epsilon) then
    call destroy_qran()
  endif

  if(.not. pol%skip_chi) then
    if(peinf%inode == 0) call close_file(17) ! file chi_converge.dat
    call destroy_fftw_plans()
  endif
  if (pol%iwritecoul .eq. 1) then
    if (peinf%inode .eq. 0) then
      call close_file(19) ! file vcoul
    endif
  endif

  if (peinf%inode .eq. 0 .and. pol%freq_dep .eq. 2) then 
    call close_file(51) !file EpsInvDyn
    call close_file(52) !file EpsDyn 
  endif


  SAFE_DEALLOCATE(ekin)
  SAFE_DEALLOCATE_P(kp%w)
  SAFE_DEALLOCATE_P(kp%rk)
  SAFE_DEALLOCATE_P(kp%el)

  if(pol%need_WFNq) then
    SAFE_DEALLOCATE_P(kpq%w)
    SAFE_DEALLOCATE_P(kpq%rk)
    SAFE_DEALLOCATE_P(kpq%el)
  endif
  SAFE_DEALLOCATE_P(gvec%components)
  SAFE_DEALLOCATE_P(gvec%index_vec)
  SAFE_DEALLOCATE_P(pol%qpt)
  SAFE_DEALLOCATE_P(vwfn%isort)
  SAFE_DEALLOCATE_P(cwfn%isort)
  SAFE_DEALLOCATE_P(cwfn%band_index)
  SAFE_DEALLOCATE_P(peinf%global_nvown)
  SAFE_DEALLOCATE_P(peinf%global_ncown)
  SAFE_DEALLOCATE_P(peinf%indexc)
  SAFE_DEALLOCATE_P(peinf%indexv)
  SAFE_DEALLOCATE_P(peinf%global_indexv)
  SAFE_DEALLOCATE_P(peinf%invindexv)
  SAFE_DEALLOCATE_P(peinf%invindexc)
  SAFE_DEALLOCATE_P(peinf%doiownv)
  SAFE_DEALLOCATE_P(peinf%doiownc)
  SAFE_DEALLOCATE_P(peinf%does_it_ownv)
  SAFE_DEALLOCATE_P(peinf%does_it_ownc)
  SAFE_DEALLOCATE_P(peinf%global_pairowner)
  SAFE_DEALLOCATE_P(pol%dFreqGrid)
  SAFE_DEALLOCATE_P(pol%dFreqBrd)
  if(peinf%inode.eq.0) then
    SAFE_DEALLOCATE_P(pol%nmtx_of_q)
    call close_file(7) ! epsilon.log
#ifdef CPLX
    if(pol%freq_dep == 2) call close_file(83) !polchiA.log
#endif

    if (pol%skip_epsilon.and..not.pol%use_hdf5) then      
      if (pol%nq0>0) call close_file(10) ! chi0mat
      if (pol%nq1>0) call close_file(11) ! chimat 
    else 
      if (.not.pol%use_hdf5) then
        if (pol%nq0>0) call close_file(12) ! eps0mat 
        if (pol%nq1>0) call close_file(13) ! epsmat 
      endif
    endif ! pol%skip_epsilon 
  endif

  call free_wfns(pol, intwfnv, intwfnvq, intwfnc, .true.)

!------------- Print Timing Info -----------------------------------------

#ifdef MPI
  call MPI_barrier(MPI_COMM_WORLD,mpierr)
#endif

#ifdef VERBOSE
  call logit('Calculating Timing Info')
#endif

  routnam(1)='TOTAL:'
  routnam(2)='INPUT:'
  routnam(3)='INPUT_Q:'
  routnam(4)='FULLBZ:'
  routnam(5)='GVEC:'
  routnam(6)='SUBGRP:'
  routnam(8)='IRRBZ:'
  routnam(9)='GENWF:'
  routnam(10)='MTXEL:'
  routnam(11)='RQSTAR:'
  routnam(12)='GMAP:'
  routnam(13)='EPSINV (TOTAL):'
  routnam(14)='CHI SUM (COMM):'
  routnam(15)='CHI SUM (TOTAL):'
  routnam(16)='GENWF (VAL):'
  routnam(17)='GENWF (COND):'
  routnam(18)='EPSINV (VCOUL):'
  routnam(19)='JOB SETUP:'
  routnam(20)='Q LOOP SETUP:'
  routnam(21)='INIT CUTOFF:'
  routnam(22)='INIT SCALAPACK:'
  routnam(23)='INIT ARRAYS:'
  routnam(24)='CONVERGE TESTS:'
  routnam(25)='MTXEL (DENOM):'
  routnam(26)='MTXEL (FFT):'
  routnam(28)='GENWF (C-Ekin):'
  routnam(29)='GENWF (C-Sort):'
  routnam(30)='CHI SUM (' // TOSTRING(X(GEMM)) // '):'
  routnam(31)='CHI SUM (PREP):'
  routnam(32)='MTXEL EXP(DENOM):'
  routnam(33)='MTXEL EXP (FFT):'
  routnam(40)='OPT FFT:'
  routnam(41)='OPT FFT (INIT):'
  routnam(42)='OPT FFT (COMM_FFT):'
  routnam(43)='OPT FFT (FFT):'
  routnam(44)='CHI SUM (ARRAY ALLOC):'
  routnam(45)='EPSINV (I/O)'
  routnam(46)='EPSINV (INVERT)'
  routnam(47)='EPS (I/O) COMM'
  routnam(48)='EPS (I/O) IO'
  routnam(51)='CHI SUM (ROW):'
  routnam(52)='CHI SUM (COLUMN):'
  routnam(53)='CHI SUM (HT):'
  routnam(81)='INPUT(Q) I/O'
  routnam(82)='INPUT(Q) COMM'
  routnam(91)='FFT ZERO'
  routnam(92)='FFT PUT'
  routnam(93)='FFT PLAN'
  routnam(94)='FFT EXEC'
  routnam(95)='FFT MLTPLY'

  routsrt=(/ 1,4,8,6,11,12,5,2,3,81,82,9,16,17,28,29,19,20,21,22,23, &
    10,25,26,32,33,15,14,31,44,51,52,30,53,13,18,45,46,47,48,24,40,41,42,43, &
    91,92,93,94,95 /)

  if(peinf%inode.eq.0) then
    call timacc(1,2)
    write(6,*)
    write(6,9000) 'CPU [s]','WALL [s]','#'
    write(6,*)
    do i=2,ubound(routsrt, 1)
      call timacc(routsrt(i),3,tsec,ncount)
      write(6,9001) routnam(routsrt(i)),tsec(1),tsec(2),ncount
    enddo
    write(6,*)
    call timacc(routsrt(1),3,tsec,ncount)
    write(6,9002) routnam(routsrt(1)),tsec(1),tsec(2)
    write(6,*)
    write(6,*)
9000 format(24x,a13,  3x,a13,  3x,a9)
9001 format(a24,f13.3,3x,f13.3,3x,i9)
9002 format(a24,f13.3,3x,f13.3)
  endif

  call write_memory_usage()
  
  if (pol%iwriteint .eq. 0) then
    iunit_c = 100028+peinf%inode
    write(filename,'(a,i4.4)') 'INT_CWFN_', peinf%inode
    call open_file(iunit_c, file = filename, status='old')
    call close_file(iunit_c, delete = .true.) ! files INT_CWFN_*

    if(peinf%inode == 0) then
      iunit_v = 200028
      call open_file(iunit_v, file = 'INT_VWFN', status='old')
      call close_file(iunit_v, delete = .true.) ! files INT_VWFN

      ! this is the condition for calling input_q above
      if(pol%need_WFNq) then
        iunit_v = 300028
        call open_file(iunit_v, file = 'INT_VWFQ', status='old')
        call close_file(iunit_v, delete = .true.) ! files INT_VWFQ
      endif
    endif
  endif

!-------------------------------
! JIM: Close HDF interface
#ifdef HDF5
  if(pol%use_hdf5.or.pol%os_hdf5) call h5close_f(hdf5_error)
#endif
  
#ifdef MPI
  call MPI_Finalize(mpierr)
#endif

end program epsilon
