from mpi4py import MPI
import numpy as np
from scipy import ndimage
import ucf
import h5py
import os
import resource

class ibmppp:
    """Parallel Python Postprocessor for the IBM-DNS code"""
    def __init__(self,comm,dir_base,flowType,chunksPerProc=(1,1,1),numberOfGhosts=(1,1,1),precision='float64'):
        """Class constructor"""
        # Save method arguments for later use
        self.__comm = comm
        self.__rank = comm.Get_rank()
        self.__nproc = comm.Get_size()
        self.__dir_base = dir_base
        self.__nxppp = chunksPerProc[0]
        self.__nyppp = chunksPerProc[1]
        self.__nzppp = chunksPerProc[2]
        self.__nghx = numberOfGhosts[0]
        self.__nghy = numberOfGhosts[1]
        self.__nghz = numberOfGhosts[2]
        self.__precision = precision
        # Declare variables which hold fields
        self.field = {'u': None, 'v': None, 'w': None, 'p': None, 's1': None}
        # Declare variables which hold statistics
        self.spatialMean    = {'u': None, 'v': None, 'w': None, 'p': None, 's1': None}
        self.spatialMsqr    = {'u': None, 'v': None, 'w': None, 'p': None, 's1': None}
        self.spatialNsample = {'u': None, 'v': None, 'w': None, 'p': None, 's1': None}
        # Declare variables for grid (to be set by "loadGrid()")
        self.grid = {'u': [None,None,None], 'v':[None,None,None], 'w': [None,None,None], 'p': [None,None,None], 's1': [None,None,None]}
        self.domainBounds = (None,None,None,None,None,None)
        self.__boundarySymmetries = {'u': None, 'v': None, 'w': None, 'p': None, 's1': None}
        self.__xperiodic = None
        self.__yperiodic = None
        self.__zperiodic = None
        self.localGrid = {
          'u':  [None,None,None],
          'v':  [None,None,None],
          'w':  [None,None,None],
          'p':  [None,None,None],
          's1': [None,None,None]
        } # (ibeg,iend,jbeg,jend,kbeg,kend) of original processor grid
        self.__dx = {'u': None, 'v': None, 'w': None, 'p': None, 's1': None}
        # Declare internal variables for processor grid (to be set by "loadProcGrid()")
        self.__procGrid1 = {
          'u':  [None,None,None,None,None,None],
          'v':  [None,None,None,None,None,None],
          'w':  [None,None,None,None,None,None],
          'p':  [None,None,None,None,None,None],
          's1': [None,None,None,None,None,None]
        } # (ibeg,iend,jbeg,jend,kbeg,kend) of original processor grid
        self.__procGrid = {
          'u':  [None,None,None,None,None,None],
          'v':  [None,None,None,None,None,None],
          'w':  [None,None,None,None,None,None],
          'p':  [None,None,None,None,None,None],
          's1': [None,None,None,None,None,None]
        } # (ibeg,iend,jbeg,jend,kbeg,kend) of internal processor grid
        self.__nproc1 = None
        self.__nxp1 = None; self.__nyp1 = None; self.__nzp1 = None
        self.__nxp = None; self.__nyp = None; self.__nzp = None
        self.__ipbeg = None; self.__jpbeg = None; self.__kpbeg = None
        self.__nghbr = None
        # Declare internal variables for local grid indices held by this processor
        self.__localChunkBounds = {
          'u':  [None,None,None,None,None,None],
          'v':  [None,None,None,None,None,None],
          'w':  [None,None,None,None,None,None],
          'p':  [None,None,None,None,None,None],
          's1': [None,None,None,None,None,None]
        } # (ib,ie,jb,je,kb,ku) grid indices of local chunk
        self.__localChunkSize = {
          'u':  [None,None,None],
          'v':  [None,None,None],
          'w':  [None,None,None],
          'p':  [None,None,None],
          's1': [None,None,None]
        } # (nxl,nyl,nzl) size of local chunk (without ghost cells)
        # Particles related quantities
        self.__npartg, self.__npartl, self.__ncol = None, None, None
        self.particles = None
        self.col = None
        # Evaluate flow type and set boundary conditions
        self.__setBoundaryConditions(flowType)
        # Prepare the processor layout
        self.loadProcGrid()
        # Load grid
        self.loadGrid()

    def loadProcGrid(self):
        '''Read input processor grid, compute processor grid for workers'''
        if self.__rank==0:
            # Read the processor grid
            file_proc = self.__dir_base+'/proc.bin'
            (self.__procGrid1['u'][0], self.__procGrid1['u'][1], self.__procGrid1['u'][2], self.__procGrid1['u'][3], self.__procGrid1['u'][4], self.__procGrid1['u'][5],
             self.__procGrid1['v'][0], self.__procGrid1['v'][1], self.__procGrid1['v'][2], self.__procGrid1['v'][3], self.__procGrid1['v'][4], self.__procGrid1['v'][5],
             self.__procGrid1['w'][0], self.__procGrid1['w'][1], self.__procGrid1['w'][2], self.__procGrid1['w'][3], self.__procGrid1['w'][4], self.__procGrid1['w'][5],
             self.__procGrid1['p'][0], self.__procGrid1['p'][1], self.__procGrid1['p'][2], self.__procGrid1['p'][3], self.__procGrid1['p'][4], self.__procGrid1['p'][5],
             self.__procGrid1['s1'][0],self.__procGrid1['s1'][1],self.__procGrid1['s1'][2],self.__procGrid1['s1'][3],self.__procGrid1['s1'][4],self.__procGrid1['s1'][5]) = ucf.readProcgrid(file_proc)
            # Verify that this processor grid can be redistributed onto the requested processor layout
            self.__nxp1 = len(self.__procGrid1['u'][0])
            self.__nyp1 = len(self.__procGrid1['u'][2])
            self.__nzp1 = len(self.__procGrid1['u'][4])
            self.__nproc1 = self.__nxp1*self.__nyp1*self.__nzp1
            if self.__nxp1%self.__nxppp != 0:
              raise ValueError('Number of processors must be divisible by the number of processors per process. (nxp={}, nxppp={})'.format(self.__nxp1,self.__nxppp))
            if self.__nyp1%self.__nyppp != 0:
              raise ValueError('Number of processors must be divisible by the number of processors per process. (nyp={}, nyppp={})'.format(self.__nyp1,self.__nyppp))
            if self.__nzp1%self.__nzppp != 0:
              raise ValueError('Number of processors must be divisible by the number of processors per process. (nzp={}, nzppp={})'.format(self.__nzp1,self.__nzppp))
            # Determine the new processor layout and verify total number of MPI processes
            self.__nxp = self.__nxp1//self.__nxppp
            self.__nyp = self.__nyp1//self.__nyppp
            self.__nzp = self.__nzp1//self.__nzppp
            if self.__nproc!=self.__nxp*self.__nyp*self.__nzp:
              raise ValueError('Number of MPI processes does not match the requested processor layout. (MPI procs: {}, required procs: {}'.format(self.__nproc,self.__nxp*self.__nyp*self.__nzp))
            self.__ipbeg = range(0,self.__nxp1,self.__nxppp)
            self.__jpbeg = range(0,self.__nyp1,self.__nyppp)
            self.__kpbeg = range(0,self.__nzp1,self.__nzppp)
            for key in self.__procGrid1:
              self.__procGrid[key][0] = self.__procGrid1[key][0][0::self.__nxppp]
              self.__procGrid[key][1] = self.__procGrid1[key][1][self.__nxppp-1::self.__nxppp]
              self.__procGrid[key][2] = self.__procGrid1[key][2][0::self.__nyppp]
              self.__procGrid[key][3] = self.__procGrid1[key][3][self.__nyppp-1::self.__nyppp]
              self.__procGrid[key][4] = self.__procGrid1[key][4][0::self.__nzppp]
              self.__procGrid[key][5] = self.__procGrid1[key][5][self.__nzppp-1::self.__nzppp]
        # Broadcast the data
        self.__procGrid1 = self.__comm.bcast(self.__procGrid1,root=0)
        self.__procGrid  = self.__comm.bcast(self.__procGrid,root=0)
        (self.__nxp1,self.__nyp1,self.__nzp1) = self.__comm.bcast((self.__nxp1,self.__nyp1,self.__nzp1),root=0)
        (self.__nxp, self.__nyp ,self.__nzp ) = self.__comm.bcast((self.__nxp ,self.__nyp ,self.__nzp ),root=0)
        (self.__ipbeg,self.__jpbeg,self.__kpbeg) = self.__comm.bcast((self.__ipbeg,self.__jpbeg,self.__kpbeg),root=0)
        # Get position in processor grid
        (self.__icol,self.__jrow,self.__kpln) = self.__positionFromRank(self.__rank,self.__nxp,self.__nyp,self.__nzp)
        # Determine local grid indices and size
        for key in self.__procGrid:
          self.__localChunkBounds[key][0] = self.__procGrid[key][0][self.__icol]
          self.__localChunkBounds[key][1] = self.__procGrid[key][1][self.__icol]
          self.__localChunkBounds[key][2] = self.__procGrid[key][2][self.__jrow]
          self.__localChunkBounds[key][3] = self.__procGrid[key][3][self.__jrow]
          self.__localChunkBounds[key][4] = self.__procGrid[key][4][self.__kpln]
          self.__localChunkBounds[key][5] = self.__procGrid[key][5][self.__kpln]
          self.__localChunkSize[key][0] = self.__localChunkBounds[key][1] - self.__localChunkBounds[key][0] + 1
          self.__localChunkSize[key][1] = self.__localChunkBounds[key][3] - self.__localChunkBounds[key][2] + 1
          self.__localChunkSize[key][2] = self.__localChunkBounds[key][5] - self.__localChunkBounds[key][4] + 1
          # Verify that local grid size is not smaller than ghost cell size
          if (self.__localChunkSize[key][0]<self.__nghx or
              self.__localChunkSize[key][1]<self.__nghy or
              self.__localChunkSize[key][2]<self.__nghz):
            raise ValueError('Local grid size must be greater than number of ghost cells in each direction!')
        # Initialize neighbor array
        self.__initializeNeighbors()

    def loadGrid(self):
        '''Read and distribute physical grid'''
        buffer = None
        if self.__rank==0:
            # Read the grid
            file_grid = self.__dir_base+'/grid.bin'
            buffer = ucf.readGrid(file_grid)
        # Broadcast the data
        buffer = self.__comm.bcast(buffer,root=0)
        # Unpack the buffer
        (self.grid['u'][0], self.grid['u'][1], self.grid['u'][2],
         self.grid['v'][0], self.grid['v'][1], self.grid['v'][2],
         self.grid['w'][0], self.grid['w'][1], self.grid['w'][2],
         self.grid['p'][0], self.grid['p'][1], self.grid['p'][2],
         self.grid['s1'][0],self.grid['s1'][1],self.grid['s1'][2]) = buffer
        # Compute dx per grid
        for key in self.grid:
          self.__dx[key] = self.grid[key][0][1]-self.grid[key][0][0]
        # Compute domain bounds
        if self.__xperiodic:
          a,b = 0.0, self.grid['u'][0][-1]+self.__dx['u']
        else:
          a,b = self.grid['u'][0][0]+0.5*self.__dx['u'], self.grid['u'][0][-1]-0.5*self.__dx['u']
        if self.__yperiodic:
          c,d = 0.0, self.grid['v'][1][-1]+self.__dx['v']
        else:
          c,d = self.grid['v'][1][0]+0.5*self.__dx['v'], self.grid['v'][1][-1]-0.5*self.__dx['v']
        if self.__zperiodic:
          e,f = 0.0, self.grid['w'][2][-1]+self.__dx['w']
        else:
          e,f = self.grid['w'][2][0]+0.5*self.__dx['w'], self.grid['w'][2][-1]-0.5*self.__dx['w']
        self.domainBounds = (a,b,c,d,e,f)
        # Determine local grid arrays and dx per field
        for key in self.grid:
          self.localGrid[key][0] = np.concatenate(
            (np.arange(-self.__nghx,0)*self.__dx[key]+self.grid[key][0][self.__localChunkBounds[key][0]-1],  # lower ghost cells
             self.grid[key][0][self.__localChunkBounds[key][0]-1:self.__localChunkBounds[key][1]],           # chunk grid
             np.arange(1,self.__nghx+1)*self.__dx[key]+self.grid[key][0][self.__localChunkBounds[key][1]-1]) # upper ghost cells
            )
          self.localGrid[key][1] = np.concatenate(
            (np.arange(-self.__nghy,0)*self.__dx[key]+self.grid[key][1][self.__localChunkBounds[key][2]-1],  # lower ghost cells
             self.grid[key][1][self.__localChunkBounds[key][2]-1:self.__localChunkBounds[key][3]],           # chunk grid
             np.arange(1,self.__nghy+1)*self.__dx[key]+self.grid[key][1][self.__localChunkBounds[key][3]-1]) # upper ghost cells
            )
          self.localGrid[key][2] = np.concatenate(
            (np.arange(-self.__nghz,0)*self.__dx[key]+self.grid[key][2][self.__localChunkBounds[key][4]-1],  # lower ghost cells
             self.grid[key][2][self.__localChunkBounds[key][4]-1:self.__localChunkBounds[key][5]],           # chunk grid
             np.arange(1,self.__nghz+1)*self.__dx[key]+self.grid[key][2][self.__localChunkBounds[key][5]-1]) # upper ghost cells
            )

    def loadParticles(self):
        '''Reads particles from file and distributes them to the correct processor.'''
        # Rank 0 reads the file and does the redistribution.
        # There is No need to distinguish between master and slave particles here, since we will
        # only use them for masking the fields. Also we do not care about particles in ghost cells.
        if self.__rank==0:
          # Setup MPI send request list
          reqsend = []
          # Read input file
          file_input = self.__dir_base+'particles.bin'
          pp,col = ucf.readParticles(file_input,step=1)
          # Remove time dimension, because we are dealing with a single time step exclusively
          pp = pp[:,:,0]
          self.__ncol = pp.shape[0]
          self.__npartg = pp.shape[1]
        # Broadcast total number of particles and columnes
        self.__ncol,self.__npartg = self.__comm.bcast((self.__ncol,self.__npartg),root=0)
        # Distribute particles among processors
        if self.__rank==0:
          # Duplicate particles over periodic boundaries and assign them a negative ID
          dirkey = ['x','y','z']
          isPeriodic = (self.__xperiodic,self.__yperiodic,self.__zperiodic)
          for idir in range(0,3):
            if not isPeriodic[idir]:
              continue
            # Get domain boundaries in this direction
            bdmin = self.domainBounds[2*idir]
            bdmax = self.domainBounds[2*idir+1]
            # Shift and append particles which are within one diameter from the lower bound
            li_part = (pp[col[dirkey[idir]],:]<=bdmin+2.0*pp[col['r'],:])
            pptmp = pp[:,li_part].copy()
            pptmp[col[dirkey[idir]],:] += bdmax-bdmin
            pptmp[col['id'],:] = -np.abs(pptmp[col['id'],:])
            pp = np.concatenate((pp,pptmp),axis=1)
            # Shift and append particles which are within one diameter from the upper bound
            li_part = (pp[col[dirkey[idir]],:]>=bdmax-2.0*pp[col['r'],:])
            pptmp = pp[:,li_part].copy()
            pptmp[col[dirkey[idir]],:] -= bdmax-bdmin
            pptmp[col['id'],:] = -np.abs(pptmp[col['id'],:])
            pp = np.concatenate((pp,pptmp),axis=1)
          # Go through individual chunks and distribute particles
          for ip in range(0,self.__nxp):
            for jp in range(0,self.__nyp):
              for kp in range(0,self.__nzp):
                # Determine rank
                rank_dst = self.__rankFromPosition(ip,jp,kp,self.__nxp,self.__nyp,self.__nzp)
                pos_dst = (ip,jp,kp)
                # Determine chunk boundaries
                bdmin = [ np.inf,  np.inf,  np.inf]
                bdmax = [-np.inf, -np.inf, -np.inf]
                for key in self.grid:
                  for idir in range(0,3):
                    bdmin[idir] = np.minimum(bdmin[idir],self.grid[key][idir][self.__procGrid[key][2*idir][pos_dst[idir]]-1])
                    bdmax[idir] = np.maximum(bdmax[idir],self.grid[key][idir][self.__procGrid[key][2*idir+1][pos_dst[idir]]-1]+self.__dx[key])
                # Find particles which affect that chunk
                li_part =  ((pp[col['x'],:]>=bdmin[0]-pp[col['r'],:]) &
                            (pp[col['x'],:]<=bdmax[0]+pp[col['r'],:]) &
                            (pp[col['y'],:]>=bdmin[1]-pp[col['r'],:]) &
                            (pp[col['y'],:]<=bdmax[1]+pp[col['r'],:]) &
                            (pp[col['z'],:]>=bdmin[2]-pp[col['r'],:]) &
                            (pp[col['z'],:]<=bdmax[2]+pp[col['r'],:]))
                # Send them to that processor
                sendbuf = (np.ascontiguousarray(pp[:,li_part]),col)
                reqsend.append(self.__comm.isend(sendbuf,dest=rank_dst))
        # Every rank needs to receive the particles, rank 0 send them to itself
        buffsize = 32*1024*1024
        reqrecv = self.__comm.irecv(buffsize,source=0)
        (self.particles,self.col) = reqrecv.wait()
        # Compute local number of particles (including ghost particles)
        self.__npartl = self.particles.shape[1]
        # Wait for everyone to finish
        if self.__rank==0:
          MPI.Request.waitall(reqsend)

    def loadField(self,key):
        '''Reads chunks from files'''
        # Determine which chunks are to be loaded by this processor
        (ip,jp,kp) = self.__positionFromRank(self.__rank,self.__nxp,self.__nyp,self.__nzp)
        ipb1 = self.__ipbeg[ip]
        jpb1 = self.__jpbeg[jp]
        kpb1 = self.__kpbeg[kp]
        ipe1 = ipb1+self.__nxppp-1
        jpe1 = jpb1+self.__nyppp-1
        kpe1 = kpb1+self.__nzppp-1
        # Get grid bounds for the field to be loaded.
        # Also determine file which contains the desired field as well as the
        # dataset ID
        if key[0]=='u':
          filebase = self.__dir_base+'/uvwp.'
          dset = 1
        elif key[0]=='v':
          filebase = self.__dir_base+'/uvwp.'
          dset = 2
        elif key[0]=='w':
          filebase = self.__dir_base+'/uvwp.'
          dset = 3
        elif key[0]=='p':
          filebase = self.__dir_base+'/uvwp.'
          dset = 4
        elif key[0]=='s':
          filebase = self.__dir_base+'/scal.'
          dset = int(key[1])
          if dset!=1:
            self.fields[key] = None
            self.grid[key] = self.grid['s1']
            self.__procGrid1[key] = self.__procGrid1['s1']
            self.__procGrid[key] = self.__procGrid['s1']
            self.__localChunkBounds[key] = self.__localChunkBounds['s1']
            self.__localChunkSize[key] = self.__localChunkSize['s1']
        else:
          raise ValueError('Unknown field: {}'.format(field))
        ib = self.__procGrid1[key][0][ipb1]
        ie = self.__procGrid1[key][1][ipe1]
        jb = self.__procGrid1[key][2][jpb1]
        je = self.__procGrid1[key][3][jpe1]
        kb = self.__procGrid1[key][4][kpb1]
        ke = self.__procGrid1[key][5][kpe1]
        nxl = ie-ib+1
        nyl = je-jb+1
        nzl = ke-kb+1
        # Allocate an array to hold the field
        self.field[key] = np.empty((nxl+2*self.__nghx,nyl+2*self.__nghy,nzl+2*self.__nghz),dtype=self.__precision,order='F')
        # Go through each chunk to be read and construct the field
        for ip1 in range(ipb1,ipe1+1):
          for jp1 in range(jpb1,jpe1+1):
            for kp1 in range(kpb1,kpe1+1):
              # Determine rank of the chunk to be read
              iprank = self.__rankFromPosition(ip1,jp1,kp1,self.__nxp1,self.__nyp1,self.__nzp1)
              file_input = filebase+'{:05d}'.format(iprank)
              # Compute bounds of this chunk
              ib1 = self.__procGrid1[key][0][ip1]
              ie1 = self.__procGrid1[key][1][ip1]
              jb1 = self.__procGrid1[key][2][jp1]
              je1 = self.__procGrid1[key][3][jp1]
              kb1 = self.__procGrid1[key][4][kp1]
              ke1 = self.__procGrid1[key][5][kp1]
              # Read chunk and check its validity
              [chunk] = ucf.readFieldChunk(file_input,dset=dset)
              if ib1!=chunk['ibeg'] or ie1!=chunk['ibeg']+chunk['nxl']-1:
                  raise ValueError('Loaded chunk does not correspond to processor grid (ibeg={}, nxl={} vs ibeg={},nxl={}'.format(ib1,ie1-ib1+1,chunk['ibeg'],chunk['nxl']))
              if jb1!=chunk['jbeg'] or je1!=chunk['jbeg']+chunk['nyl']-1:
                  raise ValueError('Loaded chunk does not correspond to processor grid (jbeg={}, nyl={} vs jbeg={},nyl={}'.format(jb1,je1-jb1+1,chunk['jbeg'],chunk['nyl']))
              if kb1!=chunk['kbeg'] or ke1!=chunk['kbeg']+chunk['nzl']-1:
                  raise ValueError('Loaded chunk does not correspond to processor grid (kbeg={}, nzl={} vs kbeg={},nzl={}'.format(kb1,ke1-kb1+1,chunk['kbeg'],chunk['nzl']))
              ngh1 = chunk['ighost']
              ibeg = [i + ngh1 for i in [0,0,0]]
              iend = [i-ngh1 for i in chunk['data'].shape]
              # No need to care about ghost cells here, we communicate them later
              self.field[key][ib1-ib+self.__nghx:ie1-ib+self.__nghx+1,
                              jb1-jb+self.__nghy:je1-jb+self.__nghy+1,
                              kb1-kb+self.__nghz:ke1-kb+self.__nghz+1] = chunk['data'][ibeg[0]:iend[0],\
                                                                                       ibeg[1]:iend[1],\
                                                                                       ibeg[2]:iend[2]]
        # Exchange ghost cells and set boundary conditions
        self.exchangeGhostCells(key)
        self.imposeBoundaryConditions(key)

    def loadStatistics(self,filename,key):
      '''Loads statistics from a h5 file.'''
      if self.__rank==0:
        # Construct file name
        file_h5 = filename+'.h5'
        # Open the file and write data
        fid = h5py.File(file_h5,'r')
        gid = fid[key]
        # Load mean
        did = gid['mean'];
        fcont = did.attrs['F_CONTIGUOUS']
        dims  = did.attrs['DIM']
        order = 'F' if fcont else 'C'
        self.spatialMean[key] = np.reshape(did,dims,order=order)
        # Load meansquare
        did = gid['meansquare'];
        fcont = did.attrs['F_CONTIGUOUS']
        dims  = did.attrs['DIM']
        order = 'F' if fcont else 'C'
        self.spatialMsqr[key] = np.reshape(did,dims,order=order)
        # Load nsample
        did = gid['nsample'];
        fcont = did.attrs['F_CONTIGUOUS']
        dims  = did.attrs['DIM']
        order = 'F' if fcont else 'C'
        self.spatialNsample[key] = np.reshape(did,dims,order=order)
        # Close the file
        fid.close()

    def saveField(self,filename,key,append=False,keepAllGhost=False,xdmf=False):
      '''Writes chunks in new processor layout to h5 files'''
      # Determine filename of output file
      file_chunk = filename+'.{:05d}'.format(self.__rank)
      # Do we write all ghosts, or just a single ghost cell?
      if keepAllGhost:
        ii_rng = slice(0,2*self.__nghx+self.__localChunkSize[key][0])
        jj_rng = slice(0,2*self.__nghy+self.__localChunkSize[key][1])
        kk_rng = slice(0,2*self.__nghz+self.__localChunkSize[key][2])
        nghx = self.__nghx
        nghy = self.__nghy
        nghz = self.__nghz
      else:
        ii_rng = slice(self.__nghx-1,self.__nghx+self.__localChunkSize[key][0]+1)
        jj_rng = slice(self.__nghy-1,self.__nghy+self.__localChunkSize[key][1]+1)
        kk_rng = slice(self.__nghz-1,self.__nghz+self.__localChunkSize[key][2]+1)
        nghx = 1
        nghy = 1
        nghz = 1
      # Compute global nx,ny,nz
      nxg = self.__procGrid[key][1][-1]-self.__procGrid[key][0][0]+1
      nyg = self.__procGrid[key][3][-1]-self.__procGrid[key][2][0]+1
      nzg = self.__procGrid[key][5][-1]-self.__procGrid[key][4][0]+1
      # If append flag is set, we add a dataset to the existing file
      if append:
        ioflag = 'a'
      else:
        ioflag = 'w'
      # Open the file and write data, then close it because we are done
      fid = h5py.File(file_chunk,ioflag)
      if not append:
        fid.create_dataset('rank',data=self.__rank)
        fid.create_dataset('iproc',data=self.__icol)
        fid.create_dataset('jproc',data=self.__jrow)
        fid.create_dataset('kproc',data=self.__kpln)
        fid.create_dataset('nxproc',data=self.__nxp)
        fid.create_dataset('nyproc',data=self.__nyp)
        fid.create_dataset('nzproc',data=self.__nzp)
      gid = fid.create_group('/'+key)
      gid.create_dataset('ib',data=self.__localChunkBounds[key][0])
      gid.create_dataset('ie',data=self.__localChunkBounds[key][1])
      gid.create_dataset('jb',data=self.__localChunkBounds[key][2])
      gid.create_dataset('je',data=self.__localChunkBounds[key][3])
      gid.create_dataset('kb',data=self.__localChunkBounds[key][4])
      gid.create_dataset('ke',data=self.__localChunkBounds[key][5])
      gid.create_dataset('nxl',data=self.__localChunkSize[key][0])
      gid.create_dataset('nyl',data=self.__localChunkSize[key][1])
      gid.create_dataset('nzl',data=self.__localChunkSize[key][2])
      gid.create_dataset('nx',data=nxg)
      gid.create_dataset('ny',data=nyg)
      gid.create_dataset('nz',data=nzg)
      gid.create_dataset('ighost',data=(1,),dtype='i')
      gid.create_dataset('nghostx',data=nghx,dtype='i')
      gid.create_dataset('nghosty',data=nghy,dtype='i')
      gid.create_dataset('nghostz',data=nghz,dtype='i')
      did = gid.create_dataset('data',data=self.field[key][ii_rng,jj_rng,kk_rng].flatten('F'))
      did.attrs['F_CONTIGUOUS'] = 1
      did.attrs['DIM'] = (self.__localChunkSize[key][0]+2*nghx,self.__localChunkSize[key][1]+2*nghy,self.__localChunkSize[key][2]+2*nghz)
      fid.close()
      # Create an XDMF file for the field
      if xdmf and self.__rank==0:
        # Construct XDMF filename
        file_xdmf = filename+'_'+key+'.xdmf'
        # Open file and write header
        fid = open(file_xdmf,'w')
        fid.write('<?xml version="1.0"?>\n')
        fid.write('<Xdmf Version="2.0">\n')
        fid.write('   <Domain>\n')
        fid.write('      <Grid Name="{}" GridType="Collection" CollectionType="Spatial">\n'.format(key))
        #fid.write('         <Time Value="%.2f" />\n',params.general.simtime)
        # Get ghost cell setup
        if keepAllGhost:
          nghx,nghy,nghz = self.__nghx,self.__nghy,self.__nghz
        else:
          nghx,nghy,nghz = 1,1,1
        # Loop through chunks
        for iprank in range(0,self.__nproc):
          # Get position
          iproc,jproc,kproc = self.__positionFromRank(iprank,self.__nxp,self.__nyp,self.__nzp)
          # Get basename of current subfile (output file will be in same directory as HDF5 files)
          subfile = os.path.basename(filename+'.{:05d}'.format(iprank))
          # Compute bounds for the local grid
          ib = self.__procGrid[key][0][iproc]
          ie = self.__procGrid[key][1][iproc]
          jb = self.__procGrid[key][2][jproc]
          je = self.__procGrid[key][3][jproc]
          kb = self.__procGrid[key][4][kproc]
          ke = self.__procGrid[key][5][kproc]
          dx = self.__dx[key]
          x0 = self.grid[key][0][ib-1]-nghx*dx
          y0 = self.grid[key][1][jb-1]-nghy*dx
          z0 = self.grid[key][2][kb-1]-nghz*dx
          nxl = ie-ib+1+2*nghx
          nyl = je-jb+1+2*nghy
          nzl = ke-kb+1+2*nghz
          # Write XDMF for this processor
          fid.write('         <Grid Name="rank {:5d} position {:3d},{:3d},{:3d}">\n'.format(iprank,iproc,jproc,kproc))
          fid.write('            <Topology TopologyType="3DCORECTMesh" NumberOfElements="{:d} {:d} {:d}"/>\n'.format(nzl,nyl,nxl))
          fid.write('            <Geometry Origin="" Type="ORIGIN_DXDYDZ">\n')
          fid.write('              <DataItem DataType="Float" Dimensions="3" Format="XML" Precision="8"> {:12f} {:12f} {:12f}</DataItem>\n'.format(z0,y0,x0))
          fid.write('              <DataItem DataType="Float" Dimensions="3" Format="XML" Precision="8"> {:12f} {:12f} {:12f}</DataItem>\n'.format(dx,dx,dx))
          fid.write('            </Geometry>\n')
          fid.write('            <Attribute Name="{}" Dimensions="{:d} {:d} {:d}">\n'.format(key,nzl,nyl,nxl))
          fid.write('            <DataItem Format="HDF5" DataType="Float" Dimensions="{:d} {:d} {:d}">\n'.format(nzl,nyl,nxl))
          fid.write('            {}:/{}/data </DataItem>\n'.format(subfile,key))
          fid.write('            </Attribute>\n')
          fid.write('         </Grid>\n')
        # Write footer and close file
        fid.write('      </Grid>\n')
        fid.write('   </Domain>\n')
        fid.write('</Xdmf> \n')
        fid.close()

    def saveFieldOriginal(self,filename,key,append=False,xdmf=False):
      '''Writes chunks in original processor layout to h5 files'''
      # Determine which chunks are to be written by this processor
      (ip,jp,kp) = self.__positionFromRank(self.__rank,self.__nxp,self.__nyp,self.__nzp)
      ipb1 = self.__ipbeg[ip]
      jpb1 = self.__jpbeg[jp]
      kpb1 = self.__kpbeg[kp]
      ipe1 = ipb1+self.__nxppp-1
      jpe1 = jpb1+self.__nyppp-1
      kpe1 = kpb1+self.__nzppp-1
      # Compute global nx,ny,nz
      nxg = self.__procGrid1[key][1][-1]-self.__procGrid1[key][0][0]+1
      nyg = self.__procGrid1[key][3][-1]-self.__procGrid1[key][2][0]+1
      nzg = self.__procGrid1[key][5][-1]-self.__procGrid1[key][4][0]+1
      # If append flag is set, we add a dataset to the existing file
      if append:
        ioflag = 'a'
      else:
        ioflag = 'w'
      # Now loop through all subchunks and write one file each
      for ip1 in range(ipb1,ipe1+1):
          for jp1 in range(jpb1,jpe1+1):
            for kp1 in range(kpb1,kpe1+1):
              # Determine rank of the chunk to be written, and compose filename
              iprank = self.__rankFromPosition(ip1,jp1,kp1,self.__nxp1,self.__nyp1,self.__nzp1)
              file_chunk = filename+'.{:05d}'.format(iprank)
              # Compute bounds of the subchunk
              ib1 = self.__procGrid1[key][0][ip1]
              ie1 = self.__procGrid1[key][1][ip1]
              jb1 = self.__procGrid1[key][2][jp1]
              je1 = self.__procGrid1[key][3][jp1]
              kb1 = self.__procGrid1[key][4][kp1]
              ke1 = self.__procGrid1[key][5][kp1]
              # Get bounds of the parent chunk
              ib = self.__localChunkBounds[key][0]
              jb = self.__localChunkBounds[key][2]
              kb = self.__localChunkBounds[key][4]
              # Get the range of subdata
              ii_rng = slice(ib1-ib+self.__nghx-1,ie1-ib+self.__nghx+2)
              jj_rng = slice(jb1-jb+self.__nghy-1,je1-jb+self.__nghy+2)
              kk_rng = slice(kb1-kb+self.__nghz-1,ke1-kb+self.__nghz+2)
              # Open the file and write data, then close it
              fid = h5py.File(file_chunk,ioflag)
              if not append:
                fid.create_dataset('rank',data=iprank)
                fid.create_dataset('iproc',data=ip1)
                fid.create_dataset('jproc',data=jp1)
                fid.create_dataset('kproc',data=kp1)
                fid.create_dataset('nxproc',data=self.__nxp1)
                fid.create_dataset('nyproc',data=self.__nyp1)
                fid.create_dataset('nzproc',data=self.__nzp1)
              gid = fid.create_group('/'+key)
              gid.create_dataset('ib',data=ib1)
              gid.create_dataset('ie',data=ie1)
              gid.create_dataset('jb',data=jb1)
              gid.create_dataset('je',data=je1)
              gid.create_dataset('kb',data=kb1)
              gid.create_dataset('ke',data=ke1)
              gid.create_dataset('nxl',data=ie1-ib1+1)
              gid.create_dataset('nyl',data=je1-jb1+1)
              gid.create_dataset('nzl',data=ke1-kb1+1)
              gid.create_dataset('nx',data=nxg)
              gid.create_dataset('ny',data=nyg)
              gid.create_dataset('nz',data=nzg)
              gid.create_dataset('ighost',data=(1,),dtype='i')
              gid.create_dataset('nghostx',data=(1,),dtype='i')
              gid.create_dataset('nghosty',data=(1,),dtype='i')
              gid.create_dataset('nghostz',data=(1,),dtype='i')
              did = gid.create_dataset('data',data=self.field[key][ii_rng,jj_rng,kk_rng].flatten('F'))
              did.attrs['F_CONTIGUOUS'] = 1
              did.attrs['DIM'] = (ie1-ib1+3,je1-jb1+3,ke1-kb1+3)
              fid.close()
      # Create an XDMF file for the field
      if xdmf and self.__rank==0:
        # Construct XDMF filename
        file_xdmf = filename+'_'+key+'.xdmf'
        # Open file and write header
        fid = open(file_xdmf,'w')
        fid.write('<?xml version="1.0"?>\n')
        fid.write('<Xdmf Version="2.0">\n')
        fid.write('   <Domain>\n')
        fid.write('      <Grid Name="{}" GridType="Collection" CollectionType="Spatial">\n'.format(key))
        #fid.write('         <Time Value="%.2f" />\n',params.general.simtime)
        # Get ghost cell setup
        nghx,nghy,nghz = 1,1,1
        # Loop through chunks
        for iprank in range(0,self.__nproc1):
          # Get position
          iproc,jproc,kproc = self.__positionFromRank(iprank,self.__nxp1,self.__nyp1,self.__nzp1)
          # Get basename of current subfile (output file will be in same directory as HDF5 files)
          subfile = os.path.basename(filename+'.{:05d}'.format(iprank))
          # Compute bounds for the local grid
          ib = self.__procGrid1[key][0][iproc]
          ie = self.__procGrid1[key][1][iproc]
          jb = self.__procGrid1[key][2][jproc]
          je = self.__procGrid1[key][3][jproc]
          kb = self.__procGrid1[key][4][kproc]
          ke = self.__procGrid1[key][5][kproc]
          dx = self.__dx[key]
          x0 = self.grid[key][0][ib-1]-nghx*dx
          y0 = self.grid[key][1][jb-1]-nghy*dx
          z0 = self.grid[key][2][kb-1]-nghz*dx
          nxl = ie-ib+1+2*nghx
          nyl = je-jb+1+2*nghy
          nzl = ke-kb+1+2*nghz
          # Write XDMF for this processor
          fid.write('         <Grid Name="rank {:5d} position {:3d},{:3d},{:3d}">\n'.format(iprank,iproc,jproc,kproc))
          fid.write('            <Topology TopologyType="3DCORECTMesh" NumberOfElements="{:d} {:d} {:d}"/>\n'.format(nzl,nyl,nxl))
          fid.write('            <Geometry Origin="" Type="ORIGIN_DXDYDZ">\n')
          fid.write('              <DataItem DataType="Float" Dimensions="3" Format="XML" Precision="8"> {:12f} {:12f} {:12f}</DataItem>\n'.format(z0,y0,x0))
          fid.write('              <DataItem DataType="Float" Dimensions="3" Format="XML" Precision="8"> {:12f} {:12f} {:12f}</DataItem>\n'.format(dx,dx,dx))
          fid.write('            </Geometry>\n')
          fid.write('            <Attribute Name="{}" Dimensions="{:d} {:d} {:d}">\n'.format(key,nzl,nyl,nxl))
          fid.write('            <DataItem Format="HDF5" DataType="Float" Dimensions="{:d} {:d} {:d}">\n'.format(nzl,nyl,nxl))
          fid.write('            {}:/{}/data </DataItem>\n'.format(subfile,key))
          fid.write('            </Attribute>\n')
          fid.write('         </Grid>\n')
        # Write footer and close file
        fid.write('      </Grid>\n')
        fid.write('   </Domain>\n')
        fid.write('</Xdmf> \n')
        fid.close()

    def saveStatistics(self,filename):
      '''Writes all gathered statistics to a h5 file.'''
      if self.__rank==0:
        # Construct file name
        file_h5 = filename+'.h5'
        # Open the file and write data
        fid = h5py.File(file_h5,'w')
        for key in self.spatialMean:
          if self.spatialMean[key] is None:
            continue
          gid = fid.create_group('/'+key)
          did = gid.create_dataset('mean',data=self.spatialMean[key].ravel('C'))
          did.attrs['F_CONTIGUOUS'] = 0
          did.attrs['DIM'] = self.spatialMean[key].shape
          did = gid.create_dataset('meansquare',data=self.spatialMsqr[key].ravel('C'))
          did.attrs['F_CONTIGUOUS'] = 0
          did.attrs['DIM'] = self.spatialMsqr[key].shape
          did = gid.create_dataset('nsample',data=self.spatialNsample[key].ravel('C'))
          did.attrs['F_CONTIGUOUS'] = 0
          did.attrs['DIM'] = self.spatialNsample[key].shape
        # Close the file
        fid.close()

    def saveParticles(self,filename,xdmf=False):
        '''Writes particles to h5 file.'''
        # Select duplicate particles (they have a negative ID)
        li_part = (self.particles[self.col['id'],:]>0)
        # Construct sendbuffer
        sendbuf = self.particles[:,li_part].flatten('F')
        recvbuf = None
        # Rank 0 gathers the length of each incoming array
        sendcounts = np.array(self.__comm.gather(len(sendbuf),root=0))
        if self.__rank==0:
          recvbuf = np.empty(sum(sendcounts))
        # Communicate data
        self.__comm.Gatherv(sendbuf=sendbuf,recvbuf=(recvbuf, sendcounts),root=0)
        # Rank 0 got all particles now, but some duplicates still exist:
        # Ghost particles have been removed, but particles affecting multiple processors not.
        # Remove them by duplicate IDs.
        if self.__rank==0:
          # Reshape to unknown number of particles
          pp = recvbuf.reshape((self.__ncol,-1),order='F')
          # Get duplicate IDs
          idx = np.unique(pp[self.col['id'],:],return_index=True)[1]
          # Slice array
          pp = pp[:,idx]
          # Validate shape
          if pp.shape!=(self.__ncol,self.__npartg):
            raise ValueError('Expected to have {} particles with {} columns, but got {} particles with {} columns.'.format(self.__npartg,self.__ncol,pp.shape[1],pp.shape[0]))
        # Rank 0 got all particles now: Write data to HDF5 file
        if self.__rank==0:
          # Construct file name
          file_h5 = filename+'.h5'
          # Open file and write general information
          fid = h5py.File(file_h5,'w')
          fid.create_dataset('npart',data=self.__npartg)
          fid.create_dataset('ncol',data=self.__ncol)
          # Loop through quantities and write one dataset per quantity
          for key in self.col:
            did = fid.create_dataset(key,data=pp[self.col[key],:].flatten('K'))
          # Close h5 file
          fid.close()
          # If XDMF is to be written, do just that
          if xdmf:
            # Construct XDMF filename
            file_xdmf = filename+'.xdmf'
            # Open file
            fid = open(file_xdmf,'w')
            # Write header
            fid.write('<?xml version="1.0"?>\n')
            fid.write('<Xdmf Version="2.0">\n')
            fid.write('  <Domain>\n')
            # Write grid, i.e. particle positions
            fid.write('    <Grid Name="particles">\n');
            fid.write('      <Topology TopologyType="Polyvertex" NumberOfElements="{:d}" />\n'.format(self.__npartg))
            fid.write('      <Geometry GeometryType="X_Y_Z">\n');
            fid.write('        <DataItem Format="HDF5" Dimensions="{:d}">\n'.format(self.__npartg))
            fid.write('          {}:/x\n'.format(os.path.basename(file_h5)))
            fid.write('        </DataItem>\n')
            fid.write('        <DataItem Format="HDF5" Dimensions="{:d}">\n'.format(self.__npartg))
            fid.write('          {}:/y\n'.format(os.path.basename(file_h5)))
            fid.write('        </DataItem>\n')
            fid.write('        <DataItem Format="HDF5" Dimensions="{:d}">\n'.format(self.__npartg))
            fid.write('          {}:/z\n'.format(os.path.basename(file_h5)))
            fid.write('        </DataItem>\n')
            fid.write('      </Geometry>\n')
            # Write data
            for key in self.col:
              fid.write('      <Attribute Name="{}" Center="Node">\n'.format(key))
              fid.write('        <DataItem Format="HDF5" Dimensions="{:d}">\n'.format(self.__npartg))
              fid.write('          {}:/{}\n'.format(os.path.basename(file_h5),key))
              fid.write('        </DataItem>\n')
              fid.write('      </Attribute>\n')
            # Write footer
            fid.write('    </Grid>\n')
            fid.write('  </Domain>\n')
            fid.write('</Xdmf>\n')
            # Close file
            fid.close()

    def saveGrid(self,filename):
      '''Writes the full grid to a h5 file.'''
      if self.__rank==0:
        # Construct file name
        file_h5 = filename+'.h5'
        # Open the file and write data
        fid = h5py.File(file_h5,'w')
        for key in self.grid:
          gid = fid.create_group('/'+key)
          gid.create_dataset('x',data=self.grid[key][0])
          gid.create_dataset('y',data=self.grid[key][1])
          gid.create_dataset('z',data=self.grid[key][2])
        # Close the file
        fid.close()

    def saveProcGrid(self,filename,original=False):
      '''Writes the full grid to a h5 file.'''
      if self.__rank==0:
        # Construct file name
        file_h5 = filename+'.h5'
        # Open the file and write data
        fid = h5py.File(file_h5,'w')
        for key in self.grid:
          gid = fid.create_group('/'+key)
          if original:
            gid.create_dataset('ibeg',data=self.__procGrid1[key][0])
            gid.create_dataset('iend',data=self.__procGrid1[key][1])
            gid.create_dataset('jbeg',data=self.__procGrid1[key][2])
            gid.create_dataset('jend',data=self.__procGrid1[key][3])
            gid.create_dataset('kbeg',data=self.__procGrid1[key][4])
            gid.create_dataset('kend',data=self.__procGrid1[key][5])
          else:
            gid.create_dataset('ibeg',data=self.__procGrid[key][0])
            gid.create_dataset('iend',data=self.__procGrid[key][1])
            gid.create_dataset('jbeg',data=self.__procGrid[key][2])
            gid.create_dataset('jend',data=self.__procGrid[key][3])
            gid.create_dataset('kbeg',data=self.__procGrid[key][4])
            gid.create_dataset('kend',data=self.__procGrid[key][5])
        # Close the file
        fid.close()

    def freeField(self,key):
      '''Removes a field from memory.'''
      if key not in self.field:
        raise ValueError('Field {} cannot be freed, because it does not exist.'.format(key))
      self.field.pop(key)
      self.grid.pop(key)
      self.localGrid.pop(key)
      self.__dx.pop(key)
      self.__procGrid.pop(key)
      self.__procGrid1.pop(key)
      self.__localChunkBounds.pop(key)
      self.__localChunkSize.pop(key)
      self.__boundarySymmetries.pop(key)

    def differentiateField(self,key,direction):
      # Verify that we got a valid direction and assign numeric value for array indexing
      if direction=='x':
        idir = 0
      elif direction=='y':
        idir = 1
      elif direction=='z':
        idir = 2
      else:
        raise ValueError('Invalid direction: '+direction)
      # Find direction in which to "shift": one of -dx/2, +dx/2
      # The first point should be located on the lower boundary or +dx/2 from it
      xg  = self.grid[key][idir]         # the global grid in the direction where the shift occurs
      xb  = self.domainBounds[2*idir]    # the lower domain boundary of that grid
      dist = xb-xg[0]
      if np.abs(dist)>0.75*self.__dx[key]:
        raise ValueError('Cannot determine shift direction, because first grid points is too far from domain boundary.')
      if np.abs(dist)>0.25*self.__dx[key]:
        signShift = int(np.sign(dist))
      else:
        signShift = +1
      # Create a new field key ("u" differentiated w.r.t "y" will be "uy")
      keyout = key+direction
      shift = [0,0,0]
      shift[idir] = signShift
      symm = self.__deriveSymmetry(self.__boundarySymmetries[key],'d'+direction)
      self.__allocateField(keyout,key,shift=shift,symmetry=symm)
      # Differentiate using second-order finite difference scheme
      idx_dst = [
        slice(self.__nghx,self.__nghx+self.__localChunkSize[key][0]),
        slice(self.__nghy,self.__nghy+self.__localChunkSize[key][1]),
        slice(self.__nghz,self.__nghz+self.__localChunkSize[key][2])
      ]
      idx_hi = idx_dst.copy()
      idx_lo = idx_dst.copy()
      if signShift>0:
        idx_hi[idir] = slice(idx_hi[idir].start+1,idx_hi[idir].stop+1,idx_hi[idir].step)
      else:
        idx_lo[idir] = slice(idx_lo[idir].start-1,idx_lo[idir].stop-1,idx_lo[idir].step)
      f1odx = 1./self.__dx[key]
      self.field[keyout][idx_dst[0],idx_dst[1],idx_dst[2]] = f1odx*(self.field[key][idx_hi[0],idx_hi[1],idx_hi[2]] - self.field[key][idx_lo[0],idx_lo[1],idx_lo[2]])
      # Exchange ghost cells and set boundary conditions
      self.exchangeGhostCells(keyout)
      self.imposeBoundaryConditions(keyout)

    def Qcriterion(self,keepDerivatives=False):
      '''Computes Q criterion on the pressure grid.'''
      # Q = -0.5 (dudx^2 + dvdy^2 + dwdz^2 + 2*dudy*dvdx + 2*dudz*dwdx + 2*dvdz*dwdy)
      # Allocate a new output field on pressure grid
      self.__allocateField('Q','p',shift=(0,0,0))
      # Compute derivatives on after another and add to field
      # dudx*dudx
      if 'ux' not in self.field:
        self.differentiateField('u','x')
      symm = self.__deriveSymmetry(self.__boundarySymmetries['ux'],'mult',symm2=self.__boundarySymmetries['ux'])
      self.__allocateField('tmp','ux',shift=(0,0,0),symmetry=symm)
      self.field['tmp'] += 0.5*np.square(self.field['ux'])
      self.shiftField('tmp',self.__getShiftDirection('tmp','Q'))
      iin,jin,kin,iout,jout,kout = self.__sliceCollocated('tmp','Q')
      self.field['Q'][iout,jout,kout] -= self.field['tmp'][iin,jin,kin]
      self.freeField('tmp')
      if not keepDerivatives:
        self.freeField('ux')
      # dvdy*dvdy
      if 'vy' not in self.field:
        self.differentiateField('v','y')
      symm = self.__deriveSymmetry(self.__boundarySymmetries['vy'],'mult',symm2=self.__boundarySymmetries['vy'])
      self.__allocateField('tmp','vy',shift=(0,0,0),symmetry=symm)
      self.field['tmp'] += 0.5*np.square(self.field['vy'])
      self.shiftField('tmp',self.__getShiftDirection('tmp','Q'))
      iin,jin,kin,iout,jout,kout = self.__sliceCollocated('tmp','Q')
      self.field['Q'][iout,jout,kout] -= self.field['tmp'][iin,jin,kin]
      self.freeField('tmp')
      if not keepDerivatives:
        self.freeField('vy')
      # dwdz*dwdz
      if 'wz' not in self.field:
        self.differentiateField('w','z')
      symm = self.__deriveSymmetry(self.__boundarySymmetries['wz'],'mult',symm2=self.__boundarySymmetries['wz'])
      self.__allocateField('tmp','wz',shift=(0,0,0),symmetry=symm)
      self.field['tmp'] += 0.5*np.square(self.field['wz'])
      self.shiftField('tmp',self.__getShiftDirection('tmp','Q'))
      iin,jin,kin,iout,jout,kout = self.__sliceCollocated('tmp','Q')
      self.field['Q'][iout,jout,kout] -= self.field['tmp'][iin,jin,kin]
      self.freeField('tmp')
      if not keepDerivatives:
        self.freeField('wz')
      # dudy*dvdx
      if 'uy' not in self.field:
        self.differentiateField('u','y')
      if 'vx' not in self.field:
        self.differentiateField('v','x')
      symm = self.__deriveSymmetry(self.__boundarySymmetries['uy'],'mult',symm2=self.__boundarySymmetries['vx'])
      self.__allocateField('tmp','Q',shift=(-1,-1,0),symmetry=symm)
      ii1,jj1,kk1,iout,jout,kout = self.__sliceCollocated('uy','tmp')
      ii2,jj2,kk2,iout,jout,kout = self.__sliceCollocated('vx','tmp')
      self.field['tmp'][iout,jout,kout] += self.field['uy'][ii1,jj1,kk1]*self.field['vx'][ii2,jj2,kk2]
      self.exchangeGhostCells('tmp')
      self.imposeBoundaryConditions('tmp')
      self.shiftField('tmp',self.__getShiftDirection('tmp','Q'))
      iin,jin,kin,iout,jout,kout = self.__sliceCollocated('tmp','Q')
      self.field['Q'][iout,jout,kout] -= self.field['tmp'][iin,jin,kin]
      self.freeField('tmp')
      if not keepDerivatives:
        self.freeField('uy')
        self.freeField('vx')
      # dudz*dwdx
      if 'uz' not in self.field:
        self.differentiateField('u','z')
      if 'wx' not in self.field:
        self.differentiateField('w','x')
      symm = self.__deriveSymmetry(self.__boundarySymmetries['uz'],'mult',symm2=self.__boundarySymmetries['wx'])
      self.__allocateField('tmp','Q',shift=(-1,0,-1),symmetry=symm)
      ii1,jj1,kk1,iout,jout,kout = self.__sliceCollocated('uz','tmp')
      ii2,jj2,kk2,iout,jout,kout = self.__sliceCollocated('wx','tmp')
      self.field['tmp'][iout,jout,kout] += self.field['uz'][ii1,jj1,kk1]*self.field['wx'][ii2,jj2,kk2]
      self.exchangeGhostCells('tmp')
      self.imposeBoundaryConditions('tmp')
      self.shiftField('tmp',self.__getShiftDirection('tmp','Q'))
      iin,jin,kin,iout,jout,kout = self.__sliceCollocated('tmp','Q')
      self.field['Q'][iout,jout,kout] -= self.field['tmp'][iin,jin,kin]
      self.freeField('tmp')
      if not keepDerivatives:
        self.freeField('uz')
        self.freeField('wx')
      # dvdz*dwdy
      if 'vz' not in self.field:
        self.differentiateField('v','z')
      if 'wy' not in self.field:
        self.differentiateField('w','y')
      symm = self.__deriveSymmetry(self.__boundarySymmetries['vz'],'mult',symm2=self.__boundarySymmetries['wy'])
      self.__allocateField('tmp','Q',shift=(0,-1,-1),symmetry=symm)
      ii1,jj1,kk1,iout,jout,kout = self.__sliceCollocated('vz','tmp')
      ii2,jj2,kk2,iout,jout,kout = self.__sliceCollocated('wy','tmp')
      self.field['tmp'][iout,jout,kout] += self.field['vz'][ii1,jj1,kk1]*self.field['wy'][ii2,jj2,kk2]
      self.exchangeGhostCells('tmp')
      self.imposeBoundaryConditions('tmp')
      self.shiftField('tmp',self.__getShiftDirection('tmp','Q'))
      iin,jin,kin,iout,jout,kout = self.__sliceCollocated('tmp','Q')
      self.field['Q'][iout,jout,kout] -= self.field['tmp'][iin,jin,kin]
      self.freeField('tmp')
      if not keepDerivatives:
        self.freeField('vz')
        self.freeField('wy')
      # Exchange ghost cells and set boundary conditions
      self.exchangeGhostCells('Q')
      self.imposeBoundaryConditions('Q')

    def lambda2(self,keepDerivatives=False):
      '''Computes lambda2 vortex criterion on the pressure grid'''
      raise NotImplementedError('TBD')

    def vorticity(self,component,collocate=False,keepDerivatives=False):
      '''Computes a component of the vorticity vector.'''
      # Give a name to the output array
      keyout = 'om'+component
      # Construct the names of the derivatives that we need.
      if component=='x':
        key1,key2,idir = 'wy','vz',0
      elif component=='y':
        key1,key2,idir = 'uz','wx',1
      elif component=='z':
        key1,key2,idir = 'vx','uy',2
      else:
        raise ValueError('Invalid component: '+component)
      # Differentiate if necessary. The two derivatives should end up on the same grid.
      # However, the grid is different for the three components. If "collocated" is specified,
      # the components will be interpolated onto the pressure grid.
      if key1 not in self.field:
        self.differentiateField(key1[0],key1[1])
      if key2 not in self.field:
        self.differentiateField(key2[0],key2[1])
      # Allocate an output array which is shifted compared to the pressure grid. Impose correct symmetries.
      if not np.array_equal(self.__boundarySymmetries[key1],self.__boundarySymmetries[key2]):
        raise ValueError('Symmetry error: THIS SHOULD NOT HAPPEN.')
      symm = self.__boundarySymmetries[key1]
      shift = [-1,-1,-1]
      shift[idir] = 0
      self.__allocateField(keyout,'p',shift=shift,symmetry=symm)
      # Compute vorticity component
      ii1,jj1,kk1,iout,jout,kout = self.__sliceCollocated(key1,keyout)
      ii2,jj2,kk2,iout,jout,kout = self.__sliceCollocated(key2,keyout)
      self.field[keyout][iout,jout,kout] = self.field[key1][ii1,jj1,kk1] - self.field[key2][ii2,jj2,kk2]
      if not keepDerivatives:
        self.freeField(key1)
        self.freeField(key2)
      # Exchange ghost cells and set boundary conditions
      self.exchangeGhostCells(keyout)
      self.imposeBoundaryConditions(keyout)
      # If the fields are supposed to be collocated, shift the field to pressure grid
      if collocate:
        self.shiftField(keyout,self.__getShiftDirection(keyout,'p'))

    def dissipation(self,keepDerivatives=False):
      '''Computes dissipation rate on pressure grid.'''
      # D = 2 [dudx^2 + dvdy^2 + dwdz^2] + (dudy+dvdx)^2 + (dudz+dwdx)^2 + (dvdz+dwdy)^2
      # Viscosity is not multiplied!
      # Allocate a new output field on pressure grid
      self.__allocateField('D','p',shift=(0,0,0))
      # Compute derivatives on after another and add to field
      # dudx*dudx
      if 'ux' not in self.field:
        self.differentiateField('u','x')
      symm = self.__deriveSymmetry(self.__boundarySymmetries['ux'],'mult',symm2=self.__boundarySymmetries['ux'])
      self.__allocateField('tmp','ux',shift=(0,0,0),symmetry=symm)
      self.field['tmp'] += 2.0*np.square(self.field['ux'])
      self.shiftField('tmp',self.__getShiftDirection('tmp','D'))
      iin,jin,kin,iout,jout,kout = self.__sliceCollocated('tmp','D')
      self.field['D'][iout,jout,kout] += self.field['tmp'][iin,jin,kin]
      self.freeField('tmp')
      if not keepDerivatives:
        self.freeField('ux')
      # dvdy*dvdy
      if 'vy' not in self.field:
        self.differentiateField('v','y')
      symm = self.__deriveSymmetry(self.__boundarySymmetries['vy'],'mult',symm2=self.__boundarySymmetries['vy'])
      self.__allocateField('tmp','vy',shift=(0,0,0),symmetry=symm)
      self.field['tmp'] += 2.0*np.square(self.field['vy'])
      self.shiftField('tmp',self.__getShiftDirection('tmp','D'))
      iin,jin,kin,iout,jout,kout = self.__sliceCollocated('tmp','D')
      self.field['D'][iout,jout,kout] += self.field['tmp'][iin,jin,kin]
      self.freeField('tmp')
      if not keepDerivatives:
        self.freeField('vy')
      # dwdz*dwdz
      if 'wz' not in self.field:
        self.differentiateField('w','z')
      symm = self.__deriveSymmetry(self.__boundarySymmetries['wz'],'mult',symm2=self.__boundarySymmetries['wz'])
      self.__allocateField('tmp','wz',shift=(0,0,0),symmetry=symm)
      self.field['tmp'] += 2.0*np.square(self.field['wz'])
      self.shiftField('tmp',self.__getShiftDirection('tmp','D'))
      iin,jin,kin,iout,jout,kout = self.__sliceCollocated('tmp','D')
      self.field['D'][iout,jout,kout] += self.field['tmp'][iin,jin,kin]
      self.freeField('tmp')
      if not keepDerivatives:
        self.freeField('wz')
      # (dudy+dvdx)^2
      if 'uy' not in self.field:
        self.differentiateField('u','y')
      if 'vx' not in self.field:
        self.differentiateField('v','x')
      symm = self.__deriveSymmetry(self.__boundarySymmetries['uy'],'mult',symm2=self.__boundarySymmetries['vx'])
      self.__allocateField('tmp','D',shift=(-1,-1,0),symmetry=symm)
      ii1,jj1,kk1,iout,jout,kout = self.__sliceCollocated('uy','tmp')
      ii2,jj2,kk2,iout,jout,kout = self.__sliceCollocated('vx','tmp')
      self.field['tmp'][iout,jout,kout] += np.square(self.field['uy'][ii1,jj1,kk1]+self.field['vx'][ii2,jj2,kk2])
      self.exchangeGhostCells('tmp')
      self.imposeBoundaryConditions('tmp')
      self.shiftField('tmp',self.__getShiftDirection('tmp','D'))
      iin,jin,kin,iout,jout,kout = self.__sliceCollocated('tmp','D')
      self.field['D'][iout,jout,kout] += self.field['tmp'][iin,jin,kin]
      self.freeField('tmp')
      if not keepDerivatives:
        self.freeField('uy')
        self.freeField('vx')
      # (dudz+dwdx)^2
      if 'uz' not in self.field:
        self.differentiateField('u','z')
      if 'wx' not in self.field:
        self.differentiateField('w','x')
      symm = self.__deriveSymmetry(self.__boundarySymmetries['uz'],'mult',symm2=self.__boundarySymmetries['wx'])
      self.__allocateField('tmp','D',shift=(-1,0,-1),symmetry=symm)
      ii1,jj1,kk1,iout,jout,kout = self.__sliceCollocated('uz','tmp')
      ii2,jj2,kk2,iout,jout,kout = self.__sliceCollocated('wx','tmp')
      self.field['tmp'][iout,jout,kout] += np.square(self.field['uz'][ii1,jj1,kk1]+self.field['wx'][ii2,jj2,kk2])
      self.exchangeGhostCells('tmp')
      self.imposeBoundaryConditions('tmp')
      self.shiftField('tmp',self.__getShiftDirection('tmp','D'))
      iin,jin,kin,iout,jout,kout = self.__sliceCollocated('tmp','D')
      self.field['D'][iout,jout,kout] -= self.field['tmp'][iin,jin,kin]
      self.freeField('tmp')
      if not keepDerivatives:
        self.freeField('uz')
        self.freeField('wx')
      # (dvdz+dwdy)^2
      if 'vz' not in self.field:
        self.differentiateField('v','z')
      if 'wy' not in self.field:
        self.differentiateField('w','y')
      symm = self.__deriveSymmetry(self.__boundarySymmetries['vz'],'mult',symm2=self.__boundarySymmetries['wy'])
      self.__allocateField('tmp','D',shift=(0,-1,-1),symmetry=symm)
      ii1,jj1,kk1,iout,jout,kout = self.__sliceCollocated('vz','tmp')
      ii2,jj2,kk2,iout,jout,kout = self.__sliceCollocated('wy','tmp')
      self.field['tmp'][iout,jout,kout] += np.square(self.field['vz'][ii1,jj1,kk1]+self.field['wy'][ii2,jj2,kk2])
      self.exchangeGhostCells('tmp')
      self.imposeBoundaryConditions('tmp')
      self.shiftField('tmp',self.__getShiftDirection('tmp','D'))
      iin,jin,kin,iout,jout,kout = self.__sliceCollocated('tmp','D')
      self.field['D'][iout,jout,kout] -= self.field['tmp'][iin,jin,kin]
      self.freeField('tmp')
      if not keepDerivatives:
        self.freeField('vz')
        self.freeField('wy')
      # Exchange ghost cells and set boundary conditions
      self.exchangeGhostCells('D')
      self.imposeBoundaryConditions('D')

    def spatialStatistics(self,key,homogeneous=None):
      '''Computes spatial mean and meansquare over homogeneous directions.'''
      # If no homogeneous directions are provided, use periodicity
      if homogeneous is None:
        homogeneous = (self.__xperiodic,self.__yperiodic,self.__zperiodic)
      # Check if there is any homogeneous direction
      if not np.any(homogeneous):
        raise ValueError('Cannot compute statistics, since there is no homogeneous direction.')
      # Each processor computes local statistics:
      # We get three local arrays with len=1 in homogeneous directions and len=nl in inhomogeneous:
      # tmpsum:   sum of local non-nan values
      # tmpssq:   sum of squares of local non-nan values
      # counter:  number of points used for the sum
      tmpsum  = self.field[key][self.__nghx:-self.__nghx,self.__nghy:-self.__nghy,self.__nghz:-self.__nghz]
      tmpssq  = np.square(self.field[key][self.__nghx:-self.__nghx,self.__nghy:-self.__nghy,self.__nghz:-self.__nghz])
      counter = ~np.isnan(tmpsum)
      for idir in range(0,3):
        if homogeneous[idir]:
          counter = np.sum(counter,axis=idir,keepdims=True)
          tmpsum  = np.nansum(tmpsum,axis=idir,keepdims=True)
          tmpssq  = np.nansum(tmpssq,axis=idir,keepdims=True)
      # Reduce the statistics to processors with col/row/pln=0 in homogeneous directions
      proc_layout = np.array(range(0,self.__nproc)).reshape(self.__nxp,self.__nyp,self.__nzp)
      for idir in range(0,3):
        if homogeneous[idir]:
          pos_dst = [self.__icol,self.__jrow,self.__kpln]
          pos_dst[idir] = 0
          rank_dst = self.__rankFromPosition(pos_dst[0],pos_dst[1],pos_dst[2],self.__nxp,self.__nyp,self.__nzp)
          # Create groups for collective communication: get rank of all procs in current direction
          proc_slice = [slice(self.__icol,self.__icol+1),slice(self.__jrow,self.__jrow+1),slice(self.__kpln,self.__kpln+1)]
          proc_slice[idir] = slice(0,None)
          rank_list = proc_layout[proc_slice[0],proc_slice[1],proc_slice[2]].ravel()
          comm = self.__comm.Create(MPI.Group.Incl(self.__comm.Get_group(),rank_list))
          rank = comm.Get_rank()
          # Communicate counter
          if self.__rank==rank_dst:
            #print('Receiving rank',self.__rank,'is known in this comminucator as',rank,'for idir =',idir)
            recvbuf = np.zeros_like(counter)
          else:
            recvbuf = counter
          #print(self.__rank,'Reduce 1: destination',rank_dst,pos_dst)
          comm.Reduce(counter,recvbuf,op=MPI.SUM,root=0)
          counter = recvbuf
          # Communicate tmpsum
          if self.__rank==rank_dst:
            recvbuf = np.zeros_like(tmpsum)
          else:
            recvbuf = tmpsum
          #print(self.__rank,'Reduce 2')
          comm.Reduce(tmpsum,recvbuf,op=MPI.SUM,root=0)
          tmpsum = recvbuf
          # Communicate tmpssq
          if self.__rank==rank_dst:
            recvbuf = np.zeros_like(tmpssq)
          else:
            recvbuf = tmpssq
          comm.Reduce(tmpssq,recvbuf,op=MPI.SUM,root=0)
          tmpssq = recvbuf
          comm.Free()
      # All processors with col/row/pln!=0 in homogeneous direction are done.
      pos = [self.__icol,self.__jrow,self.__kpln]
      for idir in range(0,3):
        if homogeneous[idir] and pos[idir]!=0:
          return
      # The inhomogeneous directions need to communicate their data to rank 0, which then holds
      # the full line or plane. Rank 0 allocates the final array
      if self.__rank==0:
        # Determine size of final array
        nxg = self.__procGrid[key][1][-1]-self.__procGrid[key][0][0]+1
        nyg = self.__procGrid[key][3][-1]-self.__procGrid[key][2][0]+1
        nzg = self.__procGrid[key][5][-1]-self.__procGrid[key][4][0]+1
        dim = [nxg,nyg,nzg]
        for idir in range(0,3):
          if homogeneous[idir]:
            dim[idir] = 1
        # Allocate a nice spot of memory
        self.spatialMean[key]    = np.zeros(dim,dtype=self.__precision)
        self.spatialMsqr[key]    = np.zeros(dim,dtype=self.__precision)
        self.spatialNsample[key] = np.zeros(dim,dtype='i')
        # Create iterators for receive request
        ipend = 0 if homogeneous[0] else self.__nxp-1
        jpend = 0 if homogeneous[1] else self.__nyp-1
        kpend = 0 if homogeneous[2] else self.__nzp-1
        ipiter = range(0,ipend+1)
        jpiter = range(0,jpend+1)
        kpiter = range(0,kpend+1)
        # Iterate through sending procs
        for ip in ipiter:
          for jp in jpiter:
            for kp in kpiter:
              # Determine rank of processor from whom data is received from
              rank_src = self.__rankFromPosition(ip,jp,kp,self.__nxp,self.__nyp,self.__nzp)
              # Determine target array position
              ib,ie = (0,0) if homogeneous[0] else (self.__procGrid[key][0][ip]-1,self.__procGrid[key][1][ip]-1)
              jb,je = (0,0) if homogeneous[1] else (self.__procGrid[key][2][jp]-1,self.__procGrid[key][3][jp]-1)
              kb,ke = (0,0) if homogeneous[2] else (self.__procGrid[key][4][kp]-1,self.__procGrid[key][5][kp]-1)
              # Receive data and put it in a good spot
              if rank_src==0:
                recvbuf = counter
              else:
                recvbuf = np.empty_like(counter)
                self.__comm.Recv(recvbuf,source=rank_src,tag=1)
              self.spatialNsample[key][ib:ie+1,jb:je+1,kb:ke+1] = recvbuf
              if rank_src==0:
                recvbuf = tmpsum
              else:
                recvbuf = np.empty_like(tmpsum)
                self.__comm.Recv(recvbuf,source=rank_src,tag=2)
              self.spatialMean[key][ib:ie+1,jb:je+1,kb:ke+1] = recvbuf
              if rank_src==0:
                recvbuf = tmpssq
              else:
                recvbuf = np.empty_like(tmpssq)
                self.__comm.Recv(recvbuf,source=rank_src,tag=3)
              self.spatialMsqr[key][ib:ie+1,jb:je+1,kb:ke+1] = recvbuf
        # Rank 0 got all the data now. Finalize it!
        self.spatialMean[key] /= self.spatialNsample[key]
        self.spatialMsqr[key] /= self.spatialNsample[key]
      else:
        # All tanks but rank 0 send their data
        self.__comm.Send(counter,dest=0,tag=1)
        self.__comm.Send(tmpsum, dest=0,tag=2)
        self.__comm.Send(tmpssq, dest=0,tag=3)

    def subtractMean(self,key):
      '''Subtract mean from field.'''
      # Check if this operation can be executed
      if self.__rank==0:
        if not key in spatialMean or spatialMean[key] is None:
          raise ValueError('Statistics not available')
      # Broadcast statistics
      self.spatialMean[key] = self.__comm.bcast(self.spatialMean[key],root=0)
      # Get dimensions
      dims = self.spatialMean[key].shape
      # Slice chunk
      if dims[0]==1:
        slx = slice(0,1)
      else:
        slx = slice(self.ib)

    def collocateVelocity(self):
      '''Moves velocity components onto the pressure grid.'''
      raise NotImplementedError('TBD')

    def downsample(self,field,factor=2):
      '''Coarsens a field by integer factor.'''
      raise NotImplementedError('TBD')

    def gaussianFilter(self,key,sigma=(1,1,1),truncate=4.0):
      '''Applies a gaussian filter to a field as in-place operation. Sigma is the std of the filter in terms of grid width.'''
      # Sanity check
      if len(sigma)!=3:
        raise ValueError('Expected one value of sigma for each direction, but only got {}.'.format(len(sigma)))
      # Compute stencil width and check if we got enough ghost cells
      stencil = [0,0,0]
      for ii in range(0,3):
        stencil[ii] = 2*int(truncate*sigma[ii]+0.5)+1
      if self.__nghx<stencil[0] or self.__nghy<stencil[1] or self.__nghz<stencil[2]:
        raise ValueError('Too few ghost cells for stencil: ',stencil,(self.__nghx,self.__nghy,self.__nghz))
      #print(stencil)
      # Run scipy gaussian filter
      self.field[key] = ndimage.gaussian_filter(self.field[key],sigma,truncate=truncate,mode='constant',cval=0.0)
      # Exchange ghost cells and set boundary conditions
      self.exchangeGhostCells(key)
      self.imposeBoundaryConditions(key)

    def maskParticles(self,key,reconstruct=False,cval=np.nan,padding=0.0):
      '''Fills grid points which lie inside of solid phase with values.'''
      # Shortcut to local grid
      xg = self.localGrid[key][0]
      yg = self.localGrid[key][1]
      zg = self.localGrid[key][2]
      # Loop through local particles
      for ipart in range(0,self.__npartl):
        # Slice a box from the field around the particle
        xp,yp,zp,rp = (self.particles[self.col['x'],ipart],
                       self.particles[self.col['y'],ipart],
                       self.particles[self.col['z'],ipart],
                       self.particles[self.col['r'],ipart])
        rp += padding*self.__dx[key]
        # Get coefficients for reconstruction
        if reconstruct:
          coeff_lin = self.particles[self.col[key],ipart]
          if key=='u':
            coeff_rotx =  0.0
            coeff_roty = -self.particles[self.col['oz'],ipart]
            coeff_rotz =  self.particles[self.col['oy'],ipart]
          elif key=='v':
            coeff_rotx =  self.particles[self.col['oz'],ipart]
            coeff_roty =  0.0
            coeff_rotz = -self.particles[self.col['ox'],ipart]
          elif key=='w':
            coeff_rotx = -self.particles[self.col['oy'],ipart]
            coeff_roty =  self.particles[self.col['ox'],ipart]
            coeff_rotz =  0.0
          else:
            coeff_rotx =  0.0
            coeff_roty =  0.0
            coeff_rotz =  0.0
        # Get bounding box of particle
        idx_x = np.nonzero((xg>=xp-rp) & (xg<=xp+rp))[0]
        idx_y = np.nonzero((yg>=yp-rp) & (yg<=yp+rp))[0]
        idx_z = np.nonzero((zg>=zp-rp) & (zg<=zp+rp))[0]
        # Triple for loop
        for ii in range(idx_x[0],idx_x[-1]+1):
          Dx = xg[ii]-xp
          for jj in range(idx_y[0],idx_y[-1]+1):
            Dy = yg[jj]-yp
            for kk in range(idx_z[0],idx_z[-1]+1):
              Dz = zg[kk]-zp
              isInside = Dx*Dx+Dy*Dy+Dz*Dz <= rp*rp
              if isInside:
                if reconstruct:
                  self.field[key][ii,jj,kk] = coeff_lin + coeff_rotx*Dx + coeff_roty*Dy + coeff_rotz*Dz
                else:
                  self.field[key][ii,jj,kk] = cval

    def shiftField(self,key,shift,subsequent=True):
      '''Shifts a field by dx/2 in specified direction as in-place operation. The new values are determined by linear interpolation.'''
      # Sanity check
      if len(shift)!=3:
        raise ValueError('shift must be provided as three-component list.')
      if np.max(shift)>1 or np.min(shift)<-1:
        raise ValueError('Only single shift is allowed.')
      # If all entries of shift are zero, no shifting needs to be done.
      if np.count_nonzero(shift)==0:
        return
      # Update grid
      for idir in range(0,3):
        if shift[idir]!=0:
          self.grid[key][idir]      += shift[idir]*0.5*self.__dx[key]
          self.localGrid[key][idir] += shift[idir]*0.5*self.__dx[key]
      # Perform the interpolation
      if subsequent:
        # Three (at maximum) subsequent 1D interpolations.
        # NOTE: This algorithm requires ghost cell communication for each substep, which can be avoided if we shifted in all directions simultaneously!
        # Construct index range of target
        idx_dst = [
          slice(self.__nghx,self.__nghx+self.__localChunkSize[key][0]),
          slice(self.__nghy,self.__nghy+self.__localChunkSize[key][1]),
          slice(self.__nghz,self.__nghz+self.__localChunkSize[key][2])
        ]
        # Perform subsequent 1D interpolations.
        for idir in range(0,3):
          # If there is no shift in this direction, continue
          if shift[idir]==0:
            continue
          # Perform linear interpolation
          idx_hi = idx_dst.copy()
          idx_lo = idx_dst.copy()
          if shift[idir]>0:
            idx_hi[idir] = slice(idx_hi[idir].start+1,idx_hi[idir].stop+1,idx_hi[idir].step)
          else:
            idx_lo[idir] = slice(idx_lo[idir].start-1,idx_lo[idir].stop-1,idx_lo[idir].step)
          self.field[key][idx_dst[0],idx_dst[1],idx_dst[2]] = 0.5*(self.field[key][idx_hi[0],idx_hi[1],idx_hi[2]] + self.field[key][idx_lo[0],idx_lo[1],idx_lo[2]])
          # Exchange ghost cells and set boundary conditions
          self.exchangeGhostCells(key)
          self.imposeBoundaryConditions(key)
      else:
        # One trilinear interpolation.
        # NOTE: We only access the positions in the array which we really need to. Only one communication is required.
        # Allocate output array: will be assigned to self.field later
        field = np.zeros_like(self.field[key])
        # Determine base shift of indexing
        shiftx = -1 if shift[0]<0 else 0
        shifty = -1 if shift[1]<0 else 0
        shiftz = -1 if shift[2]<0 else 0
        # Determine shift coefficients
        cx = np.abs(shift[0])
        cy = np.abs(shift[1])
        cz = np.abs(shift[2])
        eps = 1e-8
        # Construct index range of target
        ii_dst = slice(self.__nghx,self.__nghx+self.__localChunkSize[key][0])
        jj_dst = slice(self.__nghy,self.__nghy+self.__localChunkSize[key][1])
        kk_dst = slice(self.__nghz,self.__nghz+self.__localChunkSize[key][2])
        # (0,0,0)
        ii_src = slice(self.__nghx+shiftx,self.__nghx+self.__localChunkSize[key][0]+shiftx)
        jj_src = slice(self.__nghy+shifty,self.__nghy+self.__localChunkSize[key][1]+shifty)
        kk_src = slice(self.__nghz+shiftz,self.__nghz+self.__localChunkSize[key][2]+shiftz)
        coeff = 1-0.5*(cx+cy+cz)+0.25*(cx*cy+cx*cz+cy*cz)-0.125*(cx*cy*cz)
        if np.abs(coeff)>eps:
          field[ii_dst,jj_dst,kk_dst] += coeff*self.field[key][ii_src,jj_src,kk_src]
        # (1,0,0)
        ii_src = slice(self.__nghx+shiftx+1,self.__nghx+self.__localChunkSize[key][0]+shiftx+1)
        jj_src = slice(self.__nghy+shifty  ,self.__nghy+self.__localChunkSize[key][1]+shifty  )
        kk_src = slice(self.__nghz+shiftz  ,self.__nghz+self.__localChunkSize[key][2]+shiftz  )
        coeff = 0.5*cx-0.25*(cx*cy+cx*cz)+0.125*(cx*cy*cz)
        if np.abs(coeff)>eps:
          field[ii_dst,jj_dst,kk_dst] += coeff*self.field[key][ii_src,jj_src,kk_src]
        # (0,1,0)
        ii_src = slice(self.__nghx+shiftx  ,self.__nghx+self.__localChunkSize[key][0]+shiftx  )
        jj_src = slice(self.__nghy+shifty+1,self.__nghy+self.__localChunkSize[key][1]+shifty+1)
        kk_src = slice(self.__nghz+shiftz  ,self.__nghz+self.__localChunkSize[key][2]+shiftz  )
        coeff = 0.5*cy-0.25*(cx*cy+cy*cz)+0.125*(cx*cy*cz)
        if np.abs(coeff)>eps:
          field[ii_dst,jj_dst,kk_dst] += coeff*self.field[key][ii_src,jj_src,kk_src]
        # (0,0,1)
        ii_src = slice(self.__nghx+shiftx  ,self.__nghx+self.__localChunkSize[key][0]+shiftx  )
        jj_src = slice(self.__nghy+shifty  ,self.__nghy+self.__localChunkSize[key][1]+shifty  )
        kk_src = slice(self.__nghz+shiftz+1,self.__nghz+self.__localChunkSize[key][2]+shiftz+1)
        coeff = 0.5*cz-0.25*(cx*cz+cy*cz)+0.125*(cx*cy*cz)
        if np.abs(coeff)>eps:
          field[ii_dst,jj_dst,kk_dst] += coeff*self.field[key][ii_src,jj_src,kk_src]
        # (1,1,0)
        ii_src = slice(self.__nghx+shiftx+1,self.__nghx+self.__localChunkSize[key][0]+shiftx+1)
        jj_src = slice(self.__nghy+shifty+1,self.__nghy+self.__localChunkSize[key][1]+shifty+1)
        kk_src = slice(self.__nghz+shiftz  ,self.__nghz+self.__localChunkSize[key][2]+shiftz  )
        coeff = 0.25*(cx*cy)-0.125*(cx*cy*cz)
        if np.abs(coeff)>eps:
          field[ii_dst,jj_dst,kk_dst] += coeff*self.field[key][ii_src,jj_src,kk_src]
        # (1,0,1)
        ii_src = slice(self.__nghx+shiftx+1,self.__nghx+self.__localChunkSize[key][0]+shiftx+1)
        jj_src = slice(self.__nghy+shifty  ,self.__nghy+self.__localChunkSize[key][1]+shifty  )
        kk_src = slice(self.__nghz+shiftz+1,self.__nghz+self.__localChunkSize[key][2]+shiftz+1)
        coeff = 0.25*(cx*cz)-0.125*(cx*cy*cz)
        if np.abs(coeff)>eps:
          field[ii_dst,jj_dst,kk_dst] += coeff*self.field[key][ii_src,jj_src,kk_src]
        # (0,1,1)
        ii_src = slice(self.__nghx+shiftx  ,self.__nghx+self.__localChunkSize[key][0]+shiftx  )
        jj_src = slice(self.__nghy+shifty+1,self.__nghy+self.__localChunkSize[key][1]+shifty+1)
        kk_src = slice(self.__nghz+shiftz+1,self.__nghz+self.__localChunkSize[key][2]+shiftz+1)
        coeff = 0.25*(cy*cz)-0.125*(cx*cy*cz)
        if np.abs(coeff)>eps:
          field[ii_dst,jj_dst,kk_dst] += coeff*self.field[key][ii_src,jj_src,kk_src]
        # (1,1,1)
        ii_src = slice(self.__nghx+shiftx+1,self.__nghx+self.__localChunkSize[key][0]+shiftx+1)
        jj_src = slice(self.__nghy+shifty+1,self.__nghy+self.__localChunkSize[key][1]+shifty+1)
        kk_src = slice(self.__nghz+shiftz+1,self.__nghz+self.__localChunkSize[key][2]+shiftz+1)
        coeff = 0.125*(cx*cy*cz)
        if np.abs(coeff)>eps:
          field[ii_dst,jj_dst,kk_dst] += coeff*self.field[key][ii_src,jj_src,kk_src]
        # Assign to field
        self.field[key] = field
        # Exchange ghost cells and set boundary conditions
        self.exchangeGhostCells(key)
        self.imposeBoundaryConditions(key)

    def exchangeGhostCells(self,key):
      '''Communicates all ghost cells of specified field'''
      # Trigger non-blocking communication:
      # Communicate faces (6 faces)
      self.__communicateGhostCells(key,(-1,0,0)) # left
      self.__communicateGhostCells(key,(+1,0,0)) # right
      self.__communicateGhostCells(key,(0,-1,0)) # down
      self.__communicateGhostCells(key,(0,+1,0)) # up
      self.__communicateGhostCells(key,(0,0,-1)) # front
      self.__communicateGhostCells(key,(0,0,+1)) # back
      # Communicate edges (12 edges)
      self.__communicateGhostCells(key,(-1,-1,0)) # left,down
      self.__communicateGhostCells(key,(-1,0,-1)) # left,front
      self.__communicateGhostCells(key,(-1,+1,0)) # left,up
      self.__communicateGhostCells(key,(-1,0,+1)) # left,back
      self.__communicateGhostCells(key,(+1,-1,0)) # right,down
      self.__communicateGhostCells(key,(+1,0,-1)) # right,front
      self.__communicateGhostCells(key,(+1,+1,0)) # right,up
      self.__communicateGhostCells(key,(+1,0,+1)) # right,back
      self.__communicateGhostCells(key,(0,-1,-1)) # down,front
      self.__communicateGhostCells(key,(0,-1,+1)) # down,back
      self.__communicateGhostCells(key,(0,+1,-1)) # up,front
      self.__communicateGhostCells(key,(0,+1,+1)) # up,back
      # Communicate corners (8 corners)
      self.__communicateGhostCells(key,(-1,-1,-1)) # left,down,front
      self.__communicateGhostCells(key,(-1,-1,+1)) # left,down,back
      self.__communicateGhostCells(key,(-1,+1,-1)) # left,up,front
      self.__communicateGhostCells(key,(-1,+1,+1)) # left,up,back
      self.__communicateGhostCells(key,(+1,-1,-1)) # right,down,front
      self.__communicateGhostCells(key,(+1,-1,+1)) # right,down,back
      self.__communicateGhostCells(key,(+1,+1,-1)) # right,up,front
      self.__communicateGhostCells(key,(+1,+1,+1)) # right,up,back

    def imposeBoundaryConditions(self,key):
      '''Imposes symmetry boundary conditions on each non-periodic wall'''
      self.__symmetrizeWall(key,(-1,0,0))
      self.__symmetrizeWall(key,(+1,0,0))
      self.__symmetrizeWall(key,(0,-1,0))
      self.__symmetrizeWall(key,(0,+1,0))
      self.__symmetrizeWall(key,(0,0,-1))
      self.__symmetrizeWall(key,(0,0,+1))

    def printMemoryUsage(self):
      '''Print estimation of memory usage (max over all processors)'''
      mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024.
      mem = self.__comm.reduce(sendobj=mem,op=MPI.MAX,root=0)
      if self.__rank==0:
        print('Current memory usage: {:8.2f} MiB'.format(mem))

    def printMemoryUsageField(self,key):
      '''Print estimation of memory usage per allocated field.'''
      bytesNumpy = np.dtype(self.__precision).itemsize
      nxl = self.__localChunkSize[key][0]+2*self.__nghx
      nyl = self.__localChunkSize[key][1]+2*self.__nghy
      nzl = self.__localChunkSize[key][2]+2*self.__nghz
      mem = nxl*nyl*nzl*bytesNumpy/1024.**2
      mem = self.__comm.reduce(sendobj=mem,op=MPI.MAX,root=0)
      if self.__rank==0:
        print('Memory usage of {}: {:8.2f} MiB'.format(key,mem))

    def __setBoundaryConditions(self,flowType):
      '''Sets the symmetries for ghost cells behind the wall'''
      # No-slip boundary    Free-slip boundary:
      #   u  -> -u            u  ->  u
      #   v  -> -v            v  -> -v
      #   w  -> -w            w  ->  w
      #   p  ->  p            p  ->  p
      self.__boundarySymmetries = {'u':  np.zeros((3,3,3),dtype='i'),
                                   'v':  np.zeros((3,3,3),dtype='i'),
                                   'w':  np.zeros((3,3,3),dtype='i'),
                                   'p':  np.zeros((3,3,3),dtype='i'),
                                   's1': np.zeros((3,3,3),dtype='i'),
                                   }
      if flowType in ('channel','cf'):
          self.__xperiodic = True
          self.__yperiodic = False
          self.__zperiodic = True
          self.__boundarySymmetries['u'][1,0,1]  = -1 # u -> -u at no-slip wall, since u_NS = 0
          self.__boundarySymmetries['u'][1,2,1]  = -1 # u -> -u at no-slip wall, since u_NS = 0
          self.__boundarySymmetries['v'][1,0,1]  =  1 # v ->  v at no-slip wall, since v,y_NS = 0 (no slip condition will be numerically wrong!)
          self.__boundarySymmetries['v'][1,2,1]  =  1 # v ->  v at no-slip wall, since v,y_NS = 0 (no slip condition will be numerically wrong!)
          self.__boundarySymmetries['w'][1,0,1]  = -1 # w -> -w at no-slip wall, since w_NS = 0
          self.__boundarySymmetries['w'][1,2,1]  = -1 # w -> -w at no-slip wall, since w_NS = 0
          self.__boundarySymmetries['p'][1,0,1]  =  1
          self.__boundarySymmetries['p'][1,2,1]  =  1
          self.__boundarySymmetries['s1'][1,0,1] =  1 # this depends on the actual BC, here zero gradient is imposed
          self.__boundarySymmetries['s1'][1,2,1] =  1 # this depends on the actual BC, here zero gradient is imposed
      elif flowType in ('open-channel','ocf'):
          self.__xperiodic = True
          self.__yperiodic = False
          self.__zperiodic = True
          self.__boundarySymmetries['u'][1,0,1]  = -1 # u -> -u at no-slip wall, since u_NS = 0
          self.__boundarySymmetries['u'][1,2,1]  =  1 # u ->  u at free-slip wall, since u,y_FS = 0
          self.__boundarySymmetries['v'][1,0,1]  =  1 # v ->  v at no-slip wall, since v,y_NS = 0 (no slip condition will be numerically wrong!)
          self.__boundarySymmetries['v'][1,2,1]  = -1 # v -> -v at free-slip wall, since v_FS = 0
          self.__boundarySymmetries['w'][1,0,1]  = -1 # w -> -w at no-slip wall, since w_NS = 0
          self.__boundarySymmetries['w'][1,2,1]  =  1 # w -> -w at no-slip wall, since w_NS = 0
          self.__boundarySymmetries['p'][1,0,1]  =  1
          self.__boundarySymmetries['p'][1,2,1]  =  1
          self.__boundarySymmetries['s1'][1,0,1] =  1 # this depends on the actual BC, here zero gradient is imposed
          self.__boundarySymmetries['s1'][1,2,1] =  1 # this depends on the actual BC, here zero gradient is imposed
      elif flowType in ('homiso','hit'):
          self.__xperiodic = True
          self.__yperiodic = True
          self.__zperiodic = True
      else:
          raise ValueError('Invalid flow type: '+flowType)

    def __symmetrizeWall(self,key,positionBd):
      # Sanity check
      if np.max(positionBd)>1 or np.min(positionBd)<-1:
        raise ValueError('symmetrizeWall: invalid neighbor position {}'.format(positionBd))
      if np.count_nonzero(positionBd)!=1:
        raise ValueError('symmetrizeWall: only face direction is accepted (no edges or corners)')
      inghbr =  positionBd[0]+1
      jnghbr =  positionBd[1]+1
      knghbr =  positionBd[2]+1
      eps = 1e-3*self.__dx[key]
      if self.__nghbr[inghbr,jnghbr,knghbr] is None:
        # x-direction
        if positionBd[0]==-1:                  # lower boundary
          xbd = self.domainBounds[0]
          xgl = self.localGrid[key][0]
          idx  = np.where(xgl>=xbd-eps)[0][0]  # index of first point within domain
          dist = np.abs(xgl[idx]-xbd)          # distance first point to wall: should be either 0 or dx/2
          if dist>=0.25*self.__dx[key]:        # no point on boundary
            ii_dst = slice(0,idx,1)
            ii_src = slice(2*idx-1,idx-1,-1)
          else:                                # point on boundary
            ii_dst = slice(0,idx,1)
            ii_src = slice(2*idx,idx,-1)
          self.field[key][ii_dst,:,:] = self.__boundarySymmetries[key][inghbr,jnghbr,knghbr]*self.field[key][ii_src,:,:]
        elif positionBd[0]==1:                 # upper boundary
          xbd = self.domainBounds[1]
          xgl = self.localGrid[key][0]
          idx  = np.where(xgl<=xbd+eps)[0][-1] # index of last point within domain
          dist = np.abs(xgl[idx]-xbd)          # distance first point to wall: should be either 0 or dx/2
          if dist>=0.25*self.__dx[key]:        # no point on boundary
            ii_dst = slice(idx+1,len(xgl),1)
            ii_src = slice(idx,2*idx+1-len(xgl),-1)
          else:                                # point on boundary
            ii_dst = slice(idx+1,len(xgl),1)
            ii_src = slice(idx-1,2*idx-len(xgl),-1)
          self.field[key][ii_dst,:,:] = self.__boundarySymmetries[key][inghbr,jnghbr,knghbr]*self.field[key][ii_src,:,:]
        # y-direction
        if positionBd[1]==-1:                  # lower boundary
          ybd = self.domainBounds[2]
          ygl = self.localGrid[key][1]
          idx  = np.where(ygl>=ybd-eps)[0][0]  # index of first point within domain
          dist = np.abs(ygl[idx]-ybd)          # distance first point to wall: should be either 0 or dx/2
          if dist>=0.25*self.__dx[key]:        # no point on boundary
            jj_dst = slice(0,idx,1)
            jj_src = slice(2*idx-1,idx-1,-1)
          else:                                # point on boundary
            jj_dst = slice(0,idx,1)
            jj_src = slice(2*idx,idx,-1)
          self.field[key][:,jj_dst,:] = self.__boundarySymmetries[key][inghbr,jnghbr,knghbr]*self.field[key][:,jj_src,:]
        elif positionBd[1]==1:                 # upper boundary
          ybd = self.domainBounds[3]
          ygl = self.localGrid[key][1]
          idx  = np.where(ygl<=ybd+eps)[0][-1] # index of last point within domain
          dist = np.abs(ygl[idx]-ybd)          # distance first point to wall: should be either 0 or dx/2
          if dist>=0.25*self.__dx[key]:        # no point on boundary
            jj_dst = slice(idx+1,len(ygl),1)
            jj_src = slice(idx,2*idx+1-len(ygl),-1)
          else:                                # point on boundary
            jj_dst = slice(idx+1,len(ygl),1)
            jj_src = slice(idx-1,2*idx-len(ygl),-1)
          self.field[key][:,jj_dst,:] = self.__boundarySymmetries[key][inghbr,jnghbr,knghbr]*self.field[key][:,jj_src,:]
        # z-direction
        if positionBd[2]==-1:                  # lower boundary
          zbd = self.domainBounds[4]
          zgl = self.localGrid[key][2]
          idx  = np.where(zgl>=zbd-eps)[0][0]  # index of first point within domain
          dist = np.abs(zgl[idx]-zbd)          # distance first point to wall: should be either 0 or dx/2
          if dist>=0.25*self.__dx[key]:        # no point on boundary
            kk_dst = slice(0,idx,1)
            kk_src = slice(2*idx-1,idx-1,-1)
          else:                                # point on boundary
            kk_dst = slice(0,idx,1)
            kk_src = slice(2*idx,idx,-1)
          self.field[key][:,:,kk_dst] = self.__boundarySymmetries[key][inghbr,jnghbr,knghbr]*self.field[key][:,:,kk_src]
        elif positionBd[2]==1:                 # upper boundary
          zbd = self.domainBounds[5]
          zgl = self.localGrid[key][2]
          idx  = np.where(zgl<=zbd+eps)[0][-1] # index of last point within domain
          dist = np.abs(zgl[idx]-zbd)          # distance first point to wall: should be either 0 or dx/2
          if dist>=0.25*self.__dx[key]:        # no point on boundary
            kk_dst = slice(idx+1,len(zgl),1)
            kk_src = slice(idx,2*idx+1-len(zgl),-1)
          else:                                # point on boundary
            kk_dst = slice(idx+1,len(zgl),1)
            kk_src = slice(idx-1,2*idx-len(zgl),-1)
          self.field[key][:,:,kk_dst] = self.__boundarySymmetries[key][inghbr,jnghbr,knghbr]*self.field[key][:,:,kk_src]

    def __initializeNeighbors(self):
      '''Determine neighboring processors and save them in an array'''
      nghbr = np.empty((3,3,3),dtype='object')
      # wrap-around x
      ipl = self.__icol-1
      if ipl<0:
        if self.__xperiodic:
          ipl = self.__nxp-1
        else:
          ipl = None
      ipr = self.__icol+1
      if ipr>self.__nxp-1:
        if self.__xperiodic:
          ipr = 0
        else:
          ipr = None
      inghbr = (ipl,self.__icol,ipr)
      # wrap-around y
      jpl = self.__jrow-1
      if jpl<0:
        if self.__yperiodic:
          jpl = self.__nyp-1
        else:
          jpl = None
      jpr = self.__jrow+1
      if jpr>self.__nyp-1:
        if self.__yperiodic:
          jpr = 0
        else:
          jpr = None
      jnghbr = (jpl,self.__jrow,jpr)
      # wrap-around z
      kpl = self.__kpln-1
      if kpl<0:
        if self.__zperiodic:
          kpl = self.__nzp-1
        else:
          kpl = None
      kpr = self.__kpln+1
      if kpr>self.__nzp-1:
        if self.__zperiodic:
          kpr = 0
        else:
          kpr = None
      knghbr = (kpl,self.__kpln,kpr)
      # Construct array of neighbors
      for ip in range(0,3):
        for jp in range(0,3):
          for kp in range(0,3):
            # Assign rank to neighbor array
            if inghbr[ip] is None or jnghbr[jp] is None or knghbr[kp] is None:
              nghbr[ip,jp,kp] = None
            else:
              nghbr[ip,jp,kp] = self.__rankFromPosition(inghbr[ip],jnghbr[jp],knghbr[kp],self.__nxp,self.__nyp,self.__nzp)
      # Save neighbors as class variable
      self.__nghbr = nghbr

    def __communicateGhostCells(self,key,positionDst):
      '''Triggers communication of ghost cells'''
      # Sanity check
      if np.max(positionDst)>1 or np.min(positionDst)<-1:
        raise ValueError('communicateGhostCells: invalid neighbor position {}'.format(positionDst))
      # [send/recv] get the rank of the neighbor where data is to be sent to
      # The position is passed as values -1,0,+1, but need to be converted to array indices
      ip_dst =  positionDst[0]+1
      jp_dst =  positionDst[1]+1
      kp_dst =  positionDst[2]+1
      ip_src = -positionDst[0]+1
      jp_src = -positionDst[1]+1
      kp_src = -positionDst[2]+1
      rank_dst = self.__nghbr[ip_dst,jp_dst,kp_dst]
      rank_src = self.__nghbr[ip_src,jp_src,kp_src]
      # [send/recv] create a tag
      tag = ip_dst*100+jp_dst*10+kp_dst
      # [send/recv] get the indices of data to be sent/received
      (nxl,nyl,nzl) = self.__localChunkSize[key]
      if positionDst[0]==-1:
        ii_src = slice(self.__nghx,2*self.__nghx)
        ii_dst = slice(self.__nghx+nxl,2*self.__nghx+nxl)
      elif positionDst[0]==0:
        ii_src = slice(self.__nghx,self.__nghx+nxl)
        ii_dst = slice(self.__nghx,self.__nghx+nxl)
      elif positionDst[0]==1:
        ii_src = slice(nxl,nxl+self.__nghx)
        ii_dst = slice(0,self.__nghx)
      else:
        raise ValueError('Invalid direction for ghost cell exchange: {}'.format(positionDst[0]))
      if positionDst[1]==-1:
        jj_src = slice(self.__nghy,2*self.__nghy)
        jj_dst = slice(self.__nghy+nyl,2*self.__nghy+nyl)
      elif positionDst[1]==0:
        jj_src = slice(self.__nghy,self.__nghy+nyl)
        jj_dst = slice(self.__nghy,self.__nghy+nyl)
      elif positionDst[1]==1:
        jj_src = slice(nyl,nyl+self.__nghy)
        jj_dst = slice(0,self.__nghy)
      else:
        raise ValueError('Invalid direction for ghost cell exchange: {}'.format(positionDst[1]))
      if positionDst[2]==-1:
        kk_src = slice(self.__nghz,2*self.__nghz)
        kk_dst = slice(self.__nghz+nzl,2*self.__nghz+nzl)
      elif positionDst[2]==0:
        kk_src = slice(self.__nghz,self.__nghz+nzl)
        kk_dst = slice(self.__nghz,self.__nghz+nzl)
      elif positionDst[2]==1:
        kk_src = slice(nzl,nzl+self.__nghz)
        kk_dst = slice(0,self.__nghz)
      else:
        raise ValueError('Invalid direction for ghost cell exchange: {}'.format(positionDst[2]))
      # [send/recv] Create a list for requests
      reqsend = None
      reqrecv = None
      # [send] now send the data to the neighbor, but only if there is one!
      # Communication must be done non-blocking, and we can use upper-case routines since this is a numpy array
      if rank_dst is not None:
        sendbuf = np.ascontiguousarray(self.field[key][ii_src,jj_src,kk_src])
        reqsend = self.__comm.Isend(sendbuf,dest=rank_dst,tag=tag)
      # [recv] the corresponding receive: results are stored in a buffer which will be assigned to the parent array later
      if rank_src is not None:
        recvbuf = np.zeros((ii_dst.stop-ii_dst.start,jj_dst.stop-jj_dst.start,kk_dst.stop-kk_dst.start),dtype=self.__precision)
        reqrecv = self.__comm.Irecv(recvbuf,source=rank_src,tag=tag)
      # [recv] wait for data to be received
      if reqrecv is not None:
        reqrecv.wait()
        self.field[key][ii_dst,jj_dst,kk_dst] = recvbuf
      # [send] wait for data to be sent
      if reqsend is not None:
        reqsend.wait()

    def __allocateField(self,key,keytemplate,shift=[0,0,0],symmetry=None):
      '''Alocates a new field with name key. Copy grid and processor layout from keytemplate
         and shift it by "shift". By default, the symmetries from the template are copied.
         If this is not desired, they need to be passed.'''
      # Sanity check
      if len(shift)!=3:
        raise ValueError('Direction must be provided as three-component list.')
      if np.max(shift)>1 or np.min(shift)<-1:
        raise ValueError('Only single shift is allowed.')
      # Assign new processor grid
      self.__procGrid[key]         = [ii.copy() for ii in self.__procGrid[keytemplate]]
      self.__procGrid1[key]        = [ii.copy() for ii in self.__procGrid1[keytemplate]]
      self.__localChunkBounds[key] = [ii.copy() for ii in self.__localChunkBounds[keytemplate]]
      self.__localChunkSize[key]   = [ii.copy() for ii in self.__localChunkSize[keytemplate]]
      # Assign the new grid
      self.__dx[key]             = self.__dx[keytemplate]
      self.grid[key]             = [ii.copy() for ii in self.grid[keytemplate]]
      self.localGrid[key]        = [ii.copy() for ii in self.localGrid[keytemplate]]
      # Allocate memory for the new field
      self.field[key] = np.zeros(
        (self.__localChunkSize[key][0]+2*self.__nghx,
         self.__localChunkSize[key][1]+2*self.__nghy,
         self.__localChunkSize[key][2]+2*self.__nghz),dtype=self.__precision
      )
      # Shift grid
      for idir in range(0,3):
        if shift[idir]!=0:
          self.grid[key][idir] += shift[idir]*0.5*self.__dx[key]
          self.localGrid[key][idir] += shift[idir]*0.5*self.__dx[key]
      # Assign symmetries
      if symmetry is None:
        self.__boundarySymmetries[key] = self.__boundarySymmetries[keytemplate].copy()
      else:
        self.__boundarySymmetries[key] = symmetry

    def __getShiftDirection(self,key1,key2):
      '''Get a tuple specifying the direction to shift field key1 onto key2.'''
      # Sanity check
      if np.abs(self.__dx[key1]-self.__dx[key2])>1e-8:
        raise ValueError('Cannot determine shift direction, because grid width does not match.')
      # Get first point of both grids
      x1 = (self.grid[key1][0][0], self.grid[key1][1][0], self.grid[key1][2][0])
      x2 = (self.grid[key2][0][0], self.grid[key2][1][0], self.grid[key2][2][0])
      # Determine shift direction
      shift = [0,0,0]
      dx = self.__dx[key1]
      for idir in range(0,3):
        dist = x2[idir]-x1[idir]
        if np.abs(dist)>0.75*dx:
          raise ValueError('Cannot determine shift direction, because points are more than 0.5dx apart.')
        if np.abs(dist)>0.25*dx:
          shift[idir] = int(np.sign(dist))
      return shift

    def __sliceCollocated(self,key1,key2):
      '''Get slice object for subdomain. Useful when two fields are to be multiplied.'''
      # Sanity check
      if np.abs(self.__dx[key1]-self.__dx[key2])>1e-8:
        raise ValueError('Cannot create a collocated slice, because grid width does not match.')
      # Prepare some work arrays
      eps = 1e-3*self.__dx[key1]
      ngh = (self.__nghx,self.__nghy,self.__nghz)
      sli = [None,None,None,None,None,None]
      # Loop through directions
      for idir in range(0,3):
        # Find index in key1 which matched first non-ghost cell in key2
        x1 = self.localGrid[key1][idir]
        x2 = self.localGrid[key2][idir]
        idx = np.argmin(np.abs(x1-x2[ngh[idir]]))
        # Validate that the points are collocated
        if np.abs(x1[idx]-x2[ngh[idir]])>eps:
          raise ValueError('Cannot create collocated slice, because grids are not collocated.')
        # Check if we got enough grid points in key1
        if x1[idx]+self.__localChunkSize[key2][idir]>self.__localChunkSize[key1][idir]+2*ngh[idir]:
          raise ValueError('Cannot create collocated slice, because too few local grid points are available.')
        # Create slices
        sli[idir]   = slice(idx,idx+self.__localChunkSize[key2][idir])
        sli[idir+3] = slice(ngh[idir],ngh[idir]+self.__localChunkSize[key2][idir])
      return sli

    # Some static helper functions
    @staticmethod
    def __rankFromPosition(ip,jp,kp,nxp,nyp,nzp):
      return ip*nyp*nzp+jp*nzp+kp

    @staticmethod
    def __positionFromRank(rank,nxp,nyp,nzp):
      ip = rank//(nyp*nzp)
      jp = (rank//nzp)%nyp
      kp = rank%nzp
      return (ip,jp,kp)

    @staticmethod
    def __deriveSymmetry(symm1,operation,symm2=None):
      symm = symm1.copy()
      if operation=='dx':
        symm[0,1,1] = -symm1[0,1,1]
        symm[2,1,1] = -symm1[2,1,1]
      elif operation=='dy':
        symm[1,0,1] = -symm1[1,0,1]
        symm[1,2,1] = -symm1[1,2,1]
      elif operation=='dz':
        symm[1,1,0] = -symm1[1,1,0]
        symm[1,1,2] = -symm1[1,1,2]
      elif operation=='mult':
        if symm2 is None:
          raise ValueError('Second operand must be provided for multiplication.')
        symm = np.multiply(symm1,symm2)
      else:
        raise ValueError('Invalid operation: ',operation)
      return symm

# Utility functions
def gaussianFilterStencilWidth(sigma=(1,1,1),truncate=4.0):
  if len(sigma)!=3:
        raise ValueError('Expected one value of sigma for each direction, but only got {}.'.format(len(sigma)))
  # Compute stencil width
  stencil = [0,0,0]
  for ii in range(0,3):
    stencil[ii] = 2*int(truncate*sigma[ii]+0.5)+1
  return stencil