! Copyright (c) 2011-2013 Manuel Hasert <m.hasert@grs-sim.de>
! Copyright (c) 2011-2013 Simon Zimny <s.zimny@grs-sim.de>
! Copyright (c) 2011, 2013, 2016-2017, 2019 Harald Klimach <harald.klimach@uni-siegen.de>
! Copyright (c) 2011-2016, 2019-2020 Kannan Masilamani <kannan.masilamani@uni-siegen.de>
! Copyright (c) 2011 Jan Hueckelheim <j.hueckelheim@grs-sim.de>
! Copyright (c) 2012, 2014-2016 Jiaxing Qi <jiaxing.qi@uni-siegen.de>
! Copyright (c) 2012, 2014-2015 Kartik Jain <kartik.jain@uni-siegen.de>
! Copyright (c) 2016-2017 Tobias Schneider <tobias1.schneider@student.uni-siegen.de>
! Copyright (c) 2018 Raphael Haupt <raphael.haupt@uni-siegen.de>
!
! Copyright (c) 2020 Peter Vitt <peter.vitt2@uni-siegen.de>
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions are met:
!
! 1. Redistributions of source code must retain the above copyright notice,
! this list of conditions and the following disclaimer.
!
! 2. Redistributions in binary form must reproduce the above copyright notice,
! this list of conditions and the following disclaimer in the documentation
! and/or other materials provided with the distribution.
!
! THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF SIEGEN “AS IS” AND ANY EXPRESS
! OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
! OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
! IN NO EVENT SHALL UNIVERSITY OF SIEGEN OR CONTRIBUTORS BE LIABLE FOR ANY
! DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
! (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
! ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
! SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
! ****************************************************************************** !
!> author: Manuel Hasert
!! author: Kannan Masilamani
!! author: Jiaxing Qi
!! Routines and parameter definitions for the standard D3Q19 model
?? include 'header/lbm_macros.inc'
?? include 'header/lbm_d3q19Macros.inc'
module mus_d3q19_module
  use iso_c_binding, only: c_f_pointer

  ! include treelm modules
  use env_module,            only: rk
  use tem_compileconf_module, only: vlen
  use tem_varSys_module,     only: tem_varSys_type, tem_varSys_op_type
  use tem_param_module,      only: div1_3, div1_6, div1_36, div1_8, div3_4h,  &
    &                              div2_8, cs2inv, cs4inv, t2cs2inv,t2cs4inv, &
    &                              rho0, rho0Inv
  use tem_dyn_array_module,  only: PositionOfVal

  ! include musubi modules
  use mus_field_prop_module,    only: mus_field_prop_type
  use mus_scheme_layout_module, only: mus_scheme_layout_type
  use mus_param_module,         only: mus_param_type
  use mus_varSys_module,        only: mus_varSys_data_type
  use mus_derVarPos_module,     only: mus_derVarPos_type

  implicit none

  private

  public :: bgk_advRel_d3q19

  public :: bgk_advRel_d3q19_incomp

  public :: trt_advRel_d3q19
  public :: trt_advRel_d3q19_incomp

  public :: bgk_advRel_d3q19_block

  ! =============================================================================
  ! D3Q19 flow model
  ! =============================================================================
  !> Definition of the discrete velocity set

  ! integer,parameter :: block = 32
  integer,parameter :: QQ   = 19  !< number of pdf directions

  integer,parameter :: qN00 = 1   !< west             x-
  integer,parameter :: q0N0 = 2   !< south            y-
  integer,parameter :: q00N = 3   !< bottom           z-
  integer,parameter :: q100 = 4   !< east             x+
  integer,parameter :: q010 = 5   !< north            y+
  integer,parameter :: q001 = 6   !< top              z+
  integer,parameter :: q0NN = 7   !<                  z-,y-
  integer,parameter :: q0N1 = 8   !<                  z+,y-
  integer,parameter :: q01N = 9   !<                  z-,y+
  integer,parameter :: q011 = 10  !<                  z+,y+
  integer,parameter :: qN0N = 11  !<                  x-,z-
  integer,parameter :: q10N = 12  !<                  x+,z-
  integer,parameter :: qN01 = 13  !<                  x-,z+
  integer,parameter :: q101 = 14  !<                  x+,z+
  integer,parameter :: qNN0 = 15  !<                  y-,x-
  integer,parameter :: qN10 = 16  !<                  y+,x-
  integer,parameter :: q1N0 = 17  !<                  y-,x+
  integer,parameter :: q110 = 18  !<                  y+,x+
  integer,parameter :: q000 = 19  !< rest density is last

contains

! ****************************************************************************** !
  !> Advection relaxation routine for the D3Q19 model with BGK.
  !!
  !! \[ f_\alpha(x_i+e_{\alpha,i},t+1) =
  !! f_\alpha(x_i,t) - \omega(f_\alpha(x_i,t)-f^{eq}_{\alpha}(x_i,t)) \]
  !!
  !! The number of floating point operation in this routine is 160 roughly.
  !!
  !! This subroutine interface must match the abstract interface definition
  !! [[kernel]] in scheme/[[mus_scheme_type_module]].f90 in order to be callable
  !! via [[mus_scheme_type:compute]] function pointer.
  subroutine bgk_advRel_d3q19( fieldProp, inState, outState, auxField, &
    &                          neigh, nElems, nSolve, level, layout,   &
    &                          params, varSys, derVarPos )
    ! -------------------------------------------------------------------- !
    !> Array of field properties (fluid or species)
    type(mus_field_prop_type), intent(in) :: fieldProp(:)
    !> variable system definition
    type(tem_varSys_type), intent(in) :: varSys
    !> current layout
    type(mus_scheme_layout_type), intent(in) :: layout
    !> number of elements in state Array
    integer, intent(in) :: nElems
    !> input  pdf vector
    real(kind=rk), intent(in)  ::  inState(nElems * varSys%nScalars)
    !> output pdf vector
    real(kind=rk), intent(out) :: outState(nElems * varSys%nScalars)
    !> Auxiliary field computed from pre-collision state
    !! Is updated with correct velocity field for multicomponent models
    real(kind=rk), intent(inout) :: auxField(nElems * varSys%nAuxScalars)
    !> connectivity vector
    integer, intent(in) :: neigh(nElems * layout%fStencil%QQ)
    !> number of elements solved in kernel
    integer, intent(in) :: nSolve
    !> current level
    integer,intent(in) :: level
    !> global parameters
    type(mus_param_type),intent(in) :: params
    !> position of derived quantities in varsys for all fields
    type( mus_derVarPos_type ), intent(in) :: derVarPos(:)
    ! -------------------------------------------------------------------- !
?? IF (SOA) THEN
    ! ---------------------------------------------------------------------------
    integer       :: iElem, minElem, maxElem, ii, iLink
    real(kind=rk) ::      fN00(params%block)
    real(kind=rk) ::      f0N0(params%block)
    real(kind=rk) ::      f00N(params%block)
    real(kind=rk) ::      f100(params%block)
    real(kind=rk) ::      f010(params%block)
    real(kind=rk) ::      f001(params%block)
    real(kind=rk) ::      f0NN(params%block)
    real(kind=rk) ::      f0N1(params%block)
    real(kind=rk) ::      f01N(params%block)
    real(kind=rk) ::      f011(params%block)
    real(kind=rk) ::      fN0N(params%block)
    real(kind=rk) ::      f10N(params%block)
    real(kind=rk) ::      fN01(params%block)
    real(kind=rk) ::      f101(params%block)
    real(kind=rk) ::      fNN0(params%block)
    real(kind=rk) ::      fN10(params%block)
    real(kind=rk) ::      f1N0(params%block)
    real(kind=rk) ::      f110(params%block)
    real(kind=rk) ::      f000(params%block)
    real(kind=rk) :: usqn_o1(1:params%block),&
      &                  rho(1:params%block),&
      &                  u_x(1:params%block),&
      &                  u_y(1:params%block),&
      &                  u_z(1:params%block),&
      &                omega(1:params%block)
    real(kind=rk) :: usq
    real(kind=rk) :: sum1, sum2, ui, fac
    integer :: dens_pos, vel_pos(3), elemOff
    ! ---------------------------------------------------------------------------
    dens_pos = varSys%method%val(derVarPos(1)%density)%auxField_varPos(1)
    vel_pos = varSys%method%val(derVarPos(1)%velocity)%auxField_varPos(1:3)

!$omp do schedule(static)

    !NEC$ ivdep
    !DIR$ IVDEP
    nodeloop: do minElem = 1, nSolve, params%block

      ii = 0
      maxElem = min( minElem + params%block - 1, nSolve )

      !DIR$ IVDEP
      do iElem = minElem, maxElem
        ii = ii + 1

        ! First load all local values into temp array
        fN00(ii) = inState( neigh((qN00-1)*nElems+iElem) )
        f0N0(ii) = inState( neigh((q0N0-1)*nElems+iElem) )
        f00N(ii) = inState( neigh((q00N-1)*nElems+iElem) )
        f100(ii) = inState( neigh((q100-1)*nElems+iElem) )
        f010(ii) = inState( neigh((q010-1)*nElems+iElem) )
        f001(ii) = inState( neigh((q001-1)*nElems+iElem) )
        f0NN(ii) = inState( neigh((q0NN-1)*nElems+iElem) )
        f0N1(ii) = inState( neigh((q0N1-1)*nElems+iElem) )
        f01N(ii) = inState( neigh((q01N-1)*nElems+iElem) )
        f011(ii) = inState( neigh((q011-1)*nElems+iElem) )
        fN0N(ii) = inState( neigh((qN0N-1)*nElems+iElem) )
        f10N(ii) = inState( neigh((q10N-1)*nElems+iElem) )
        fN01(ii) = inState( neigh((qN01-1)*nElems+iElem) )
        f101(ii) = inState( neigh((q101-1)*nElems+iElem) )
        fNN0(ii) = inState( neigh((qNN0-1)*nElems+iElem) )
        fN10(ii) = inState( neigh((qN10-1)*nElems+iElem) )
        f1N0(ii) = inState( neigh((q1N0-1)*nElems+iElem) )
        f110(ii) = inState( neigh((q110-1)*nElems+iElem) )
        f000(ii) = inState( neigh((q000-1)*nElems+iElem) )

        ! element offset for auxField array
        elemOff = (iElem-1)*varSys%nAuxScalars
        ! local density
        rho(ii) = auxField(elemOff + dens_pos)
        ! local x-, y- and z-velocity
        u_x(ii) = auxField(elemOff + vel_pos(1))
        u_y(ii) = auxField(elemOff + vel_pos(2))
        u_z(ii) = auxField(elemOff + vel_pos(3))

        ! square velocity and derived constants
        usq  = u_x(ii)*u_x(ii) + u_y(ii)*u_y(ii) + u_z(ii)*u_z(ii)

        omega(ii) = fieldProp(1)%fluid%viscKine%omLvl(level)%val(iElem)
        usqn_o1(ii) = div1_36 * (1._rk - 1.5_rk * usq) * rho(ii) * omega(ii)

      end do ! iElem
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q000-1)*nElems+minElem, (q000-1)*nElems+maxElem
        ii=ii+1
        outState(iLink) = f000(ii) * (1.0_rk-omega(ii)) + 12._rk*usqn_o1(ii)
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q110-1)*nElems+minElem, (q110-1)*nElems+maxElem
        ii=ii+1
        ui    = u_x(ii) + u_y(ii)
        fac   = div1_8 * omega(ii) * ui * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * ui  + usqn_o1(ii)
        outState(iLink) = f110(ii) * (1.0_rk-omega(ii)) + sum1 + sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qNN0-1)*nElems+minElem, (qNN0-1)*nElems+maxElem
        ii=ii+1
        ui    = u_x(ii) + u_y(ii)
        fac   = div1_8 * omega(ii) * ui * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * ui  + usqn_o1(ii)
        outState(iLink) = fNN0(ii) * (1.0_rk-omega(ii)) - sum1 + sum2
      end do
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN10-1)*nElems+minElem, (qN10-1)*nElems+maxElem
        ii=ii+1
        ui    = -u_x(ii) + u_y(ii)
        fac   = div1_8 * omega(ii) * ui * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * ui  + usqn_o1(ii)
        outState(iLink) = fN10(ii) *(1.0_rk-omega(ii))+sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q1N0-1)*nElems+minElem, (q1N0-1)*nElems+maxElem
        ii=ii+1
        ui    = -u_x(ii) + u_y(ii)
        fac   = div1_8 * omega(ii) * ui * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * ui  + usqn_o1(ii)
        outState(iLink) = f1N0(ii) *(1.0_rk-omega(ii))-sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q101-1)*nElems+minElem, (q101-1)*nElems+maxElem
        ii=ii+1
        ui      =  u_x(ii) + u_z(ii)
        fac  = div1_8 * omega(ii) * ui * rho(ii)
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f101(ii)*(1.0_rk-omega(ii))+sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN0N-1)*nElems+minElem, (qN0N-1)*nElems+maxElem
        ii=ii+1
        ui   =  u_x(ii) + u_z(ii)
        fac  = div1_8 * omega(ii) * ui * rho(ii)
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = fN0N(ii)*(1.0_rk-omega(ii))-sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN01-1)*nElems+minElem, (qN01-1)*nElems+maxElem
        ii=ii+1
        ui   = -u_x(ii) + u_z(ii)
        fac  = div1_8 * omega(ii) * ui * rho(ii)
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = fN01(ii)*(1.0_rk-omega(ii))+sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q10N-1)*nElems+minElem, (q10N-1)*nElems+maxElem
        ii=ii+1
        ui   = -u_x(ii) + u_z(ii)
        fac  = div1_8 * omega(ii) * ui * rho(ii)
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f10N(ii)*(1.0_rk-omega(ii))-sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q011-1)*nElems+minElem, (q011-1)*nElems+maxElem
        ii=ii+1
        ui   =  u_y(ii) + u_z(ii)
        fac  = div1_8 * omega(ii) * ui * rho(ii)
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink)=f011(ii)*(1.0_rk-omega(ii))+sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q0NN-1)*nElems+minElem, (q0NN-1)*nElems+maxElem
        ii=ii+1
       ui    =  u_y(ii) + u_z(ii)
       fac  = div1_8 * omega(ii) * ui * rho(ii)
       sum1 = fac * div3_4h
       sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f0NN(ii)*(1.0_rk-omega(ii))-sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q0N1-1)*nElems+minElem, (q0N1-1)*nElems+maxElem
        ii=ii+1
        ui   = -u_y(ii) + u_z(ii)
        fac  = div1_8 * omega(ii) * ui * rho(ii)
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f0N1(ii)*(1.0_rk-omega(ii))+sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q01N-1)*nElems+minElem, (q01N-1)*nElems+maxElem
        ii=ii+1
        ui   = -u_y(ii) + u_z(ii)
        fac  = div1_8 * omega(ii) * ui * rho(ii)
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f01N(ii)*(1.0_rk-omega(ii))-sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q010-1)*nElems+minElem, (q010-1)*nElems+maxElem
        ii=ii+1
        fac   = div2_8 * omega(ii) * u_y(ii) * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_y(ii) + 2d0*usqn_o1(ii)
        outState(iLink)=f010(ii) *(1.0_rk-omega(ii))+sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q0N0-1)*nElems+minElem, (q0N0-1)*nElems+maxElem
        ii=ii+1
        fac   = div2_8 * omega(ii) * u_y(ii) * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_y(ii) + 2d0*usqn_o1(ii)
        outState(iLink)=f0N0(ii) *(1.0_rk-omega(ii))-sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN00-1)*nElems+minElem, (qN00-1)*nElems+maxElem
        ii=ii+1
        fac   = div2_8 * omega(ii) * u_x(ii) * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_x(ii) + 2d0*usqn_o1(ii)
        outState(iLink) = fN00(ii) * (1.0_rk-omega(ii)) - sum1 + sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q100-1)*nElems+minElem, (q100-1)*nElems+maxElem
        ii=ii+1
        fac   = div2_8 * omega(ii) * u_x(ii) * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_x(ii) + 2d0*usqn_o1(ii)
        outState(iLink) = f100(ii) * (1.0_rk-omega(ii)) + sum1 + sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q00N-1)*nElems+minElem, (q00N-1)*nElems+maxElem
        ii=ii+1
        fac   = div2_8 * omega(ii) * u_z(ii) * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_z(ii) + 2d0*usqn_o1(ii)
        outState(iLink) = f00N(ii) * (1.0_rk-omega(ii)) - sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q001-1)*nElems+minElem, (q001-1)*nElems+maxElem
        ii=ii+1
        fac   = div2_8 * omega(ii) * u_z(ii) * rho(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_z(ii) + 2d0*usqn_o1(ii)
        outState(iLink) = f001(ii) *(1.0_rk-omega(ii))+sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

    enddo nodeloop
!$omp end do nowait

?? ELSE
    ! ---------------------------------------------------------------------------
    integer :: iElem, nScalars
?? copy :: pdfTmp19( f )
    real(kind=rk) :: rho     ! local density
    real(kind=rk) :: u_x     ! local x-velocity
    real(kind=rk) :: u_y     ! local y-velocity
    real(kind=rk) :: u_z     ! local z-velocity
    real(kind=rk) :: usq     ! square velocity
    ! derived constants
    real(kind=rk) :: usqn, usqn_o1, usqn_o2
    real(kind=rk) :: omega_2, cmpl_o, omega
    real(kind=rk) :: coeff_1, coeff_2
    real(kind=rk) :: ui1, ui3, ui10, ui11, ui12, ui13
    real(kind=rk) :: fac_1, fac_2, fac_3, fac_4, fac_9, fac_10, fac_11, fac_12,&
      &              fac_13
    real(kind=rk) :: sum1_1, sum1_2, sum2_1, sum2_2, sum3_1, sum3_2, sum4_1,   &
      &              sum4_2, sum9_1, sum9_2, sum10_1, sum10_2, sum11_1,        &
      &              sum11_2, sum12_1, sum12_2, sum13_1, sum13_2
    integer :: dens_pos, vel_pos(3), elemOff
    ! ---------------------------------------------------------------------------
    dens_pos = varSys%method%val(derVarPos(1)%density)%auxField_varPos(1)
    vel_pos = varSys%method%val(derVarPos(1)%velocity)%auxField_varPos(1:3)

    ! nElems = size(neigh)/QQ
    nScalars = varSys%nScalars

!$omp do schedule(static)

?? copy :: dir_novec
    nodeloop: do iElem = 1, nSolve
      ! First load all local values into temp array
      fN00 = inState(?FETCH?( qN00, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0N0 = inState(?FETCH?( q0N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f00N = inState(?FETCH?( q00N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f100 = inState(?FETCH?( q100, 1, iElem, QQ, nScalars, nElems,neigh ))
      f010 = inState(?FETCH?( q010, 1, iElem, QQ, nScalars, nElems,neigh ))
      f001 = inState(?FETCH?( q001, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0NN = inState(?FETCH?( q0NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f0N1 = inState(?FETCH?( q0N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f01N = inState(?FETCH?( q01N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f011 = inState(?FETCH?( q011, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN0N = inState(?FETCH?( qN0N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f10N = inState(?FETCH?( q10N, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN01 = inState(?FETCH?( qN01, 1, iElem, QQ, nScalars, nElems,neigh ))
      f101 = inState(?FETCH?( q101, 1, iElem, QQ, nScalars, nElems,neigh ))
      fNN0 = inState(?FETCH?( qNN0, 1, iElem, QQ, nScalars, nElems,neigh ))
      fN10 = inState(?FETCH?( qN10, 1, iElem, QQ, nScalars, nElems,neigh ))
      f1N0 = inState(?FETCH?( q1N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f110 = inState(?FETCH?( q110, 1, iElem, QQ, nScalars, nElems,neigh ))
      f000 = inState(?FETCH?( q000, 1, iElem, QQ, nScalars, nElems,neigh ))

      ! element offset for auxField array
      elemOff = (iElem-1)*varSys%nAuxScalars
      ! local density
      rho = auxField(elemOff + dens_pos)
      ! local x-, y- and z-velocity
      u_x = auxField(elemOff + vel_pos(1))
      u_y = auxField(elemOff + vel_pos(2))
      u_z = auxField(elemOff + vel_pos(3))

      ! square velocity and derived constants
      usq  = u_x*u_x + u_y*u_y + u_z*u_z
      usqn = div1_36 * (1._rk - 1.5_rk * usq) * rho

      ! read the relaxation parameter omega for the current level
      omega   = fieldProp(1)%fluid%viscKine%omLvl(level)%val(iElem)
      ! pre-calculate partial collision constants
      cmpl_o  = 1._rk - omega

      ! f = (1-w) * f + w * fEq
      outState(?SAVE?(q000,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                     f000*cmpl_o+omega*rho*(div1_3-0.5_rk*usq)

      coeff_1 = div1_8 * omega * rho

      usqn_o1 = omega * usqn

      ui1     =  u_x + u_y
      fac_1   = coeff_1 * ui1
      sum1_1  = fac_1 * div3_4h
      sum1_2  = fac_1 * ui1 + usqn_o1

      outState(?SAVE?(q110,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   f110 *cmpl_o+sum1_1 +sum1_2
      outState(?SAVE?(qNN0,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   fNN0 *cmpl_o-sum1_1 +sum1_2

      ui3     = -u_x + u_y
      fac_3   = coeff_1 * ui3
      sum3_1  = fac_3 * div3_4h
      sum3_2  = fac_3 * ui3 + usqn_o1

      outState(?SAVE?(qN10,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   fN10 *cmpl_o+sum3_1 +sum3_2
      outState(?SAVE?(q1N0,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   f1N0 *cmpl_o-sum3_1 +sum3_2

      ui10    =  u_x + u_z
      fac_10  = coeff_1 * ui10
      sum10_1 = fac_10 * div3_4h
      sum10_2 = fac_10 * ui10 + usqn_o1

      outState(?SAVE?(q101,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   f101*cmpl_o+sum10_1+sum10_2
      outState(?SAVE?(qN0N,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   fN0N*cmpl_o-sum10_1+sum10_2

      ui12    = -u_x + u_z
      fac_12  = coeff_1 * ui12
      sum12_1 = fac_12 * div3_4h
      sum12_2 = fac_12 * ui12 + usqn_o1

      outState(?SAVE?(qN01,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   fN01*cmpl_o+sum12_1+sum12_2
      outState(?SAVE?(q10N,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   f10N*cmpl_o-sum12_1+sum12_2

      ui11    =  u_y + u_z
      fac_11  = coeff_1 * ui11
      sum11_1 = fac_11 * div3_4h
      sum11_2 = fac_11 * ui11 + usqn_o1

      outState(?SAVE?(q011,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   f011*cmpl_o+sum11_1+sum11_2
      outState(?SAVE?(q0NN,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   f0NN*cmpl_o-sum11_1+sum11_2

      ui13    = -u_y + u_z
      fac_13  = coeff_1 * ui13
      sum13_1 = fac_13 * div3_4h
      sum13_2 = fac_13 * ui13 + usqn_o1

      outState(?SAVE?(q0N1,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   f0N1*cmpl_o+sum13_1+sum13_2
      outState(?SAVE?(q01N,1, iElem, QQ, nScalars,nElems,neigh)) =             &
        &                                   f01N*cmpl_o-sum13_1+sum13_2

      omega_2 = 2._rk * omega
      coeff_2 = div1_8 * omega_2 * rho
      usqn_o2 = omega_2 * usqn

      fac_2   = coeff_2 * u_y
      sum2_1  = fac_2 * div3_4h
      sum2_2  = fac_2 * u_y + usqn_o2

      outState(?SAVE?(q010,1, iElem, QQ, nScalars,nElems,neigh )) =            &
        &                                   f010 *cmpl_o+sum2_1 +sum2_2
      outState(?SAVE?(q0N0,1, iElem, QQ, nScalars,nElems,neigh )) =            &
        &                                   f0N0 *cmpl_o-sum2_1 +sum2_2

      fac_4   = coeff_2 * u_x
      sum4_1  = fac_4 * div3_4h
      sum4_2  = fac_4 * u_x + usqn_o2

      outState(?SAVE?(qN00,1, iElem, QQ, nScalars,nElems,neigh )) =            &
        &                                   fN00 *cmpl_o-sum4_1 +sum4_2
      outState(?SAVE?(q100,1, iElem, QQ, nScalars,nElems,neigh )) =            &
        &                                   f100 *cmpl_o+sum4_1 +sum4_2

      fac_9   = coeff_2 * u_z
      sum9_1  = fac_9 * div3_4h
      sum9_2  = fac_9 * u_z + usqn_o2

      outState(?SAVE?(q001,1, iElem, QQ, nScalars,nElems,neigh )) =            &
        &                                   f001 *cmpl_o+sum9_1 +sum9_2
      outState(?SAVE?(q00N,1, iElem, QQ, nScalars,nElems,neigh )) =            &
        &                                   f00N *cmpl_o-sum9_1 +sum9_2

    enddo nodeloop
!$omp end do nowait
?? END IF

  end subroutine bgk_advRel_d3q19
! ****************************************************************************** !


! ****************************************************************************** !
  !> Advection relaxation routine for the D3Q19 model with BGK for
  !! incompressible lbm model
  !!
  !! This subroutine interface must match the abstract interface definition
  !! [[kernel]] in scheme/[[mus_scheme_type_module]].f90 in order to be callable
  !! via [[mus_scheme_type:compute]] function pointer.
  subroutine bgk_advRel_d3q19_incomp( fieldProp, inState, outState, auxField, &
    &                                 neigh, nElems, nSolve, level, layout,   &
    &                                 params, varSys, derVarPos               )
    ! -------------------------------------------------------------------- !
    !> Array of field properties (fluid or species)
    type(mus_field_prop_type), intent(in) :: fieldProp(:)
    !> variable system definition
    type(tem_varSys_type), intent(in) :: varSys
    !> current layout
    type(mus_scheme_layout_type), intent(in) :: layout
    !> number of elements in state Array
    integer, intent(in) :: nElems
    !> input  pdf vector
    real(kind=rk), intent(in)  ::  inState(nElems * varSys%nScalars)
    !> output pdf vector
    real(kind=rk), intent(out) :: outState(nElems * varSys%nScalars)
    !> Auxiliary field computed from pre-collision state
    !! Is updated with correct velocity field for multicomponent models
    real(kind=rk), intent(inout) :: auxField(nElems * varSys%nAuxScalars)
    !> connectivity vector
    integer, intent(in) :: neigh(nElems * layout%fStencil%QQ)
    !> number of elements solved in kernel
    integer, intent(in) :: nSolve
    !> current level
    integer,intent(in) :: level
    !> global parameters
    type(mus_param_type),intent(in) :: params
    !> position of derived quantities in varsys for all fields
    type( mus_derVarPos_type ), intent(in) :: derVarPos(:)
    ! -------------------------------------------------------------------- !
?? IF (SOA) THEN
    ! ---------------------------------------------------------------------------
    integer       :: iElem, minElem, maxElem, ii, iLink
    real(kind=rk) ::      fN00(params%block)
    real(kind=rk) ::      f0N0(params%block)
    real(kind=rk) ::      f00N(params%block)
    real(kind=rk) ::      f100(params%block)
    real(kind=rk) ::      f010(params%block)
    real(kind=rk) ::      f001(params%block)
    real(kind=rk) ::      f0NN(params%block)
    real(kind=rk) ::      f0N1(params%block)
    real(kind=rk) ::      f01N(params%block)
    real(kind=rk) ::      f011(params%block)
    real(kind=rk) ::      fN0N(params%block)
    real(kind=rk) ::      f10N(params%block)
    real(kind=rk) ::      fN01(params%block)
    real(kind=rk) ::      f101(params%block)
    real(kind=rk) ::      fNN0(params%block)
    real(kind=rk) ::      fN10(params%block)
    real(kind=rk) ::      f1N0(params%block)
    real(kind=rk) ::      f110(params%block)
    real(kind=rk) ::      f000(params%block)
    real(kind=rk) :: usqn_o1(1:params%block),&
      &                  u_x(1:params%block),&
      &                  u_y(1:params%block),&
      &                  u_z(1:params%block),&
      &                omega(1:params%block)
    real(kind=rk) :: rho, usq
    real(kind=rk) :: sum1, sum2, ui, fac
    integer :: dens_pos, vel_pos(3), elemOff
    ! ---------------------------------------------------------------------------
    dens_pos = varSys%method%val(derVarPos(1)%density)%auxField_varPos(1)
    vel_pos = varSys%method%val(derVarPos(1)%velocity)%auxField_varPos(1:3)

!$omp do schedule(static)

    !NEC$ ivdep
    !DIR$ IVDEP
    nodeloop: do minElem = 1, nSolve, params%block

      ii = 0
      maxElem = min( minElem + params%block - 1, nSolve )

      !DIR$ ivdep
      do iElem = minElem, maxElem
        ii = ii + 1

        ! First load all local values into temp array
        fN00(ii) = inState( neigh((qN00-1)*nElems+iElem) )
        f0N0(ii) = inState( neigh((q0N0-1)*nElems+iElem) )
        f00N(ii) = inState( neigh((q00N-1)*nElems+iElem) )
        f100(ii) = inState( neigh((q100-1)*nElems+iElem) )
        f010(ii) = inState( neigh((q010-1)*nElems+iElem) )
        f001(ii) = inState( neigh((q001-1)*nElems+iElem) )
        f0NN(ii) = inState( neigh((q0NN-1)*nElems+iElem) )
        f0N1(ii) = inState( neigh((q0N1-1)*nElems+iElem) )
        f01N(ii) = inState( neigh((q01N-1)*nElems+iElem) )
        f011(ii) = inState( neigh((q011-1)*nElems+iElem) )
        fN0N(ii) = inState( neigh((qN0N-1)*nElems+iElem) )
        f10N(ii) = inState( neigh((q10N-1)*nElems+iElem) )
        fN01(ii) = inState( neigh((qN01-1)*nElems+iElem) )
        f101(ii) = inState( neigh((q101-1)*nElems+iElem) )
        fNN0(ii) = inState( neigh((qNN0-1)*nElems+iElem) )
        fN10(ii) = inState( neigh((qN10-1)*nElems+iElem) )
        f1N0(ii) = inState( neigh((q1N0-1)*nElems+iElem) )
        f110(ii) = inState( neigh((q110-1)*nElems+iElem) )
        f000(ii) = inState( neigh((q000-1)*nElems+iElem) )

        ! element offset for auxField array
        elemOff = (iElem-1)*varSys%nAuxScalars
        ! local density
        rho = auxField(elemOff + dens_pos)
        ! local x-, y- and z-velocity
        u_x(ii) = auxField(elemOff + vel_pos(1))
        u_y(ii) = auxField(elemOff + vel_pos(2))
        u_z(ii) = auxField(elemOff + vel_pos(3))

        ! square velocity and derived constants
        usq  = u_x(ii)*u_x(ii) + u_y(ii)*u_y(ii) + u_z(ii)*u_z(ii)

        omega(ii) = fieldProp(1)%fluid%viscKine%omLvl(level)%val(iElem)
        usqn_o1(ii) = omega(ii) * div1_36 * ( rho - 1.5d0 * usq )

        ! @todo: also write this in a link loop?
        ! outState(?SAVE?(q000,1,iElem,QQ,QQ,nElems,neigh)) = f000(ii)*cmpl_o + 12d0*usqn_o1(ii)

      end do ! iElem
      call obscure_setzero(ii)

      !NEC$ ivdep
?? copy :: dir_vector
      do iLink = (q000-1)*nElems+minElem, (q000-1)*nElems+maxElem
        ii=ii+1
        outState(iLink) = f000(ii)*cmpl_o + 12d0*usqn_o1(ii)
      end do ! iLink
      call obscure_setzero(ii)

      !NEC$ ivdep
?? copy :: dir_vector
      do iLink = (q110-1)*nElems+minElem, (q110-1)*nElems+maxElem
        ii=ii+1
        ui    = u_x(ii) + u_y(ii)
        fac   = div1_8 * omega(ii) * ui
        sum1  = fac * div3_4h
        sum2  = fac * ui  + usqn_o1(ii)
        outState(iLink) = f110(ii) * (1.0_rk-omega(ii)) + sum1 + sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qNN0-1)*nElems+minElem, (qNN0-1)*nElems+maxElem
        ii=ii+1
        ui    = u_x(ii) + u_y(ii)
        fac   = div1_8 * omega(ii) * ui
        sum1  = fac * div3_4h
        sum2  = fac * ui  + usqn_o1(ii)
        outState(iLink) = fNN0(ii) * (1.0_rk-omega(ii)) - sum1 + sum2
      end do
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN10-1)*nElems+minElem, (qN10-1)*nElems+maxElem
        ii=ii+1
        ui    = -u_x(ii) + u_y(ii)
        fac   = div1_8 * omega(ii) * ui
        sum1  = fac * div3_4h
        sum2  = fac * ui  + usqn_o1(ii)
        outState(iLink) = fN10(ii) *(1.0_rk-omega(ii))+sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q1N0-1)*nElems+minElem, (q1N0-1)*nElems+maxElem
        ii=ii+1
        ui    = -u_x(ii) + u_y(ii)
        fac   = div1_8 * omega(ii) * ui
        sum1  = fac * div3_4h
        sum2  = fac * ui  + usqn_o1(ii)
        outState(iLink) = f1N0(ii) *(1.0_rk-omega(ii))-sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q101-1)*nElems+minElem, (q101-1)*nElems+maxElem
        ii=ii+1
        ui      =  u_x(ii) + u_z(ii)
        fac  = div1_8 * omega(ii) * ui
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f101(ii)*(1.0_rk-omega(ii))+sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN0N-1)*nElems+minElem, (qN0N-1)*nElems+maxElem
        ii=ii+1
        ui   =  u_x(ii) + u_z(ii)
        fac  = div1_8 * omega(ii) * ui
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = fN0N(ii)*(1.0_rk-omega(ii))-sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN01-1)*nElems+minElem, (qN01-1)*nElems+maxElem
        ii=ii+1
        ui   = -u_x(ii) + u_z(ii)
        fac  = div1_8 * omega(ii) * ui
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = fN01(ii)*(1.0_rk-omega(ii))+sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q10N-1)*nElems+minElem, (q10N-1)*nElems+maxElem
        ii=ii+1
        ui   = -u_x(ii) + u_z(ii)
        fac  = div1_8 * omega(ii) * ui
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f10N(ii)*(1.0_rk-omega(ii))-sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q011-1)*nElems+minElem, (q011-1)*nElems+maxElem
        ii=ii+1
        ui   =  u_y(ii) + u_z(ii)
        fac  = div1_8 * omega(ii) * ui
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink)=f011(ii)*(1.0_rk-omega(ii))+sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q0NN-1)*nElems+minElem, (q0NN-1)*nElems+maxElem
        ii=ii+1
       ui    =  u_y(ii) + u_z(ii)
       fac  = div1_8 * omega(ii) * ui
       sum1 = fac * div3_4h
       sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f0NN(ii)*(1.0_rk-omega(ii))-sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q0N1-1)*nElems+minElem, (q0N1-1)*nElems+maxElem
        ii=ii+1
        ui   = -u_y(ii) + u_z(ii)
        fac  = div1_8 * omega(ii) * ui
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f0N1(ii)*(1.0_rk-omega(ii))+sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q01N-1)*nElems+minElem, (q01N-1)*nElems+maxElem
        ii=ii+1
        ui   = -u_y(ii) + u_z(ii)
        fac  = div1_8 * omega(ii) * ui
        sum1 = fac * div3_4h
        sum2 = fac * ui + usqn_o1(ii)
        outState(iLink) = f01N(ii)*(1.0_rk-omega(ii))-sum1+sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q010-1)*nElems+minElem, (q010-1)*nElems+maxElem
        ii=ii+1
        fac   = div2_8 * omega(ii) * u_y(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_y(ii) + 2d0*usqn_o1(ii)
        outState(iLink)=f010(ii) *(1.0_rk-omega(ii))+sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q0N0-1)*nElems+minElem, (q0N0-1)*nElems+maxElem
        ii=ii+1
        fac   = div2_8 * omega(ii) * u_y(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_y(ii) + 2d0*usqn_o1(ii)
        outState(iLink)=f0N0(ii) *(1.0_rk-omega(ii))-sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN00-1)*nElems+minElem, (qN00-1)*nElems+maxElem
        ii=ii+1
        fac   = div2_8 * omega(ii) * u_x(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_x(ii) + 2d0*usqn_o1(ii)
        outState(iLink) = fN00(ii) * (1.0_rk-omega(ii)) - sum1 + sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q100-1)*nElems+minElem, (q100-1)*nElems+maxElem
        ii=ii+1
        fac   = div2_8 * omega(ii) * u_x(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_x(ii) + 2d0*usqn_o1(ii)
        outState(iLink) = f100(ii) * (1.0_rk-omega(ii)) + sum1 + sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q00N-1)*nElems+minElem, (q00N-1)*nElems+maxElem
        ii=ii+1
        fac   = div2_8 * omega(ii) * u_z(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_z(ii) + 2d0*usqn_o1(ii)
        outState(iLink) = f00N(ii) * (1.0_rk-omega(ii)) - sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q001-1)*nElems+minElem, (q001-1)*nElems+maxElem
        ii=ii+1
        fac   = div2_8 * omega(ii) * u_z(ii)
        sum1  = fac * div3_4h
        sum2  = fac * u_z(ii) + 2d0*usqn_o1(ii)
        outState(iLink) = f001(ii) *(1.0_rk-omega(ii))+sum1 +sum2
      end do ! iLink
      call obscure_setzero(ii)

    enddo nodeloop
!$omp end do nowait

?? ELSE !! SOA == FALSE
    ! ---------------------------------------------------------------------------
    integer :: iElem
?? copy :: pdfTmp19( f )
    real(kind=rk) rho, u_x, u_y, u_z, usq
    real(kind=rk) usqn_o1, usqn_o2
    real(kind=rk) cmpl_o, omega
    real(kind=rk) coeff_1, coeff_2
    real(kind=rk) ui1, ui3, ui10, ui11, ui12, ui13
    real(kind=rk) fac_1, fac_2, fac_3, fac_4, fac_9, fac_10, fac_11, fac_12,   &
      &           fac_13
    real(kind=rk) sum1_1, sum1_2, sum2_1, sum2_2, sum3_1, sum3_2, sum4_1,      &
      &           sum4_2, sum9_1, sum9_2, sum10_1, sum10_2, sum11_1, sum11_2,  &
      &           sum12_1, sum12_2, sum13_1, sum13_2
    integer :: dens_pos, vel_pos(3), elemOff
    ! ---------------------------------------------------------------------------
    dens_pos = varSys%method%val(derVarPos(1)%density)%auxField_varPos(1)
    vel_pos = varSys%method%val(derVarPos(1)%velocity)%auxField_varPos(1:3)

!$omp do schedule(static)

    !NEC$ ivdep
?? copy :: dir_novec
    nodeloop: do iElem=1,nSolve
      ! First load all local values into temp array
      fN00 = inState(?FETCH?( qN00, 1, iElem, QQ, QQ, nElems,neigh ))
      f0N0 = inState(?FETCH?( q0N0, 1, iElem, QQ, QQ, nElems,neigh ))
      f00N = inState(?FETCH?( q00N, 1, iElem, QQ, QQ, nElems,neigh ))
      f100 = inState(?FETCH?( q100, 1, iElem, QQ, QQ, nElems,neigh ))
      f010 = inState(?FETCH?( q010, 1, iElem, QQ, QQ, nElems,neigh ))
      f001 = inState(?FETCH?( q001, 1, iElem, QQ, QQ, nElems,neigh ))
      f0NN = inState(?FETCH?( q0NN, 1, iElem, QQ, QQ, nElems,neigh ))
      f0N1 = inState(?FETCH?( q0N1, 1, iElem, QQ, QQ, nElems,neigh ))
      f01N = inState(?FETCH?( q01N, 1, iElem, QQ, QQ, nElems,neigh ))
      f011 = inState(?FETCH?( q011, 1, iElem, QQ, QQ, nElems,neigh ))
      fN0N = inState(?FETCH?( qN0N, 1, iElem, QQ, QQ, nElems,neigh ))
      f10N = inState(?FETCH?( q10N, 1, iElem, QQ, QQ, nElems,neigh ))
      fN01 = inState(?FETCH?( qN01, 1, iElem, QQ, QQ, nElems,neigh ))
      f101 = inState(?FETCH?( q101, 1, iElem, QQ, QQ, nElems,neigh ))
      fNN0 = inState(?FETCH?( qNN0, 1, iElem, QQ, QQ, nElems,neigh ))
      fN10 = inState(?FETCH?( qN10, 1, iElem, QQ, QQ, nElems,neigh ))
      f1N0 = inState(?FETCH?( q1N0, 1, iElem, QQ, QQ, nElems,neigh ))
      f110 = inState(?FETCH?( q110, 1, iElem, QQ, QQ, nElems,neigh ))
      f000 = inState(?FETCH?( q000, 1, iElem, QQ, QQ, nElems,neigh ))

      ! element offset for auxField array
      elemOff = (iElem-1)*varSys%nAuxScalars
      ! local density
      rho = auxField(elemOff + dens_pos)
      ! local x-, y- and z-velocity
      u_x = auxField(elemOff + vel_pos(1))
      u_y = auxField(elemOff + vel_pos(2))
      u_z = auxField(elemOff + vel_pos(3))

      ! square velocity and derived constants
      usq  = u_x*u_x + u_y*u_y + u_z*u_z

      ! read the relaxation parameter omega for the current level
      omega   = fieldProp(1)%fluid%viscKine%omLvl(level)%val(iElem)
      ! pre-calculate partial collision constants
      cmpl_o  = 1._rk - omega

      ! usqn = div1_36 * (rho - 1.5_rk * usq * rho0 )
      usqn_o1 = omega * div1_36 * ( rho - 1.5d0 * usq )

      outState(?SAVE?(q000,1,iElem,QQ,QQ,nElems,neigh))=f000*cmpl_o + 12d0*usqn_o1

      coeff_1 = div1_8 * omega

      ui1     =  u_x + u_y
      fac_1   = coeff_1 * ui1
      sum1_1  = fac_1 * div3_4h
      sum1_2  = fac_1 * ui1 + usqn_o1

      outState(?SAVE?(q110,1,iElem,QQ,QQ,nElems,neigh))=f110 *cmpl_o+sum1_1 +sum1_2
      outState(?SAVE?(qNN0,1,iElem,QQ,QQ,nElems,neigh))=fNN0 *cmpl_o-sum1_1 +sum1_2

      ui3     = -u_x + u_y
      fac_3   = coeff_1 * ui3
      sum3_1  = fac_3 * div3_4h
      sum3_2  = fac_3 * ui3 + usqn_o1

      outState(?SAVE?(qN10,1,iElem,QQ,QQ,nElems,neigh))=fN10 *cmpl_o+sum3_1 +sum3_2
      outState(?SAVE?(q1N0,1,iElem,QQ,QQ,nElems,neigh))=f1N0 *cmpl_o-sum3_1 +sum3_2

      ui10    =  u_x + u_z
      fac_10  = coeff_1 * ui10
      sum10_1 = fac_10 * div3_4h
      sum10_2 = fac_10 * ui10 + usqn_o1

      outState(?SAVE?(q101,1,iElem,QQ,QQ,nElems,neigh))=f101*cmpl_o+sum10_1+sum10_2
      outState(?SAVE?(qN0N,1,iElem,QQ,QQ,nElems,neigh))=fN0N*cmpl_o-sum10_1+sum10_2

      ui12    = -u_x + u_z
      fac_12  = coeff_1 * ui12
      sum12_1 = fac_12 * div3_4h
      sum12_2 = fac_12 * ui12 + usqn_o1

      outState(?SAVE?(qN01,1,iElem,QQ,QQ,nElems,neigh))=fN01*cmpl_o+sum12_1+sum12_2
      outState(?SAVE?(q10N,1,iElem,QQ,QQ,nElems,neigh))=f10N*cmpl_o-sum12_1+sum12_2

      ui11    =  u_y + u_z
      fac_11  = coeff_1 * ui11
      sum11_1 = fac_11 * div3_4h
      sum11_2 = fac_11 * ui11 + usqn_o1

      outState(?SAVE?(q011,1,iElem,QQ,QQ,nElems,neigh))=f011*cmpl_o+sum11_1+sum11_2
      outState(?SAVE?(q0NN,1,iElem,QQ,QQ,nElems,neigh))=f0NN*cmpl_o-sum11_1+sum11_2

      ui13    = -u_y + u_z
      fac_13  = coeff_1 * ui13
      sum13_1 = fac_13 * div3_4h
      sum13_2 = fac_13 * ui13 + usqn_o1

      outState(?SAVE?(q0N1,1,iElem,QQ,QQ,nElems,neigh))=f0N1*cmpl_o+sum13_1+sum13_2
      outState(?SAVE?(q01N,1,iElem,QQ,QQ,nElems,neigh))=f01N*cmpl_o-sum13_1+sum13_2

      coeff_2 = div1_8 * omega * 2.0_rk
      usqn_o2 = 2d0 * usqn_o1

      fac_2   = coeff_2 * u_y
      sum2_1  = fac_2 * div3_4h
      sum2_2  = fac_2 * u_y + usqn_o2

      outState(?SAVE?(q010,1,iElem,QQ,QQ,nElems,neigh ))=f010 *cmpl_o+sum2_1 +sum2_2
      outState(?SAVE?(q0N0,1,iElem,QQ,QQ,nElems,neigh ))=f0N0 *cmpl_o-sum2_1 +sum2_2

      fac_4   = coeff_2 * u_x
      sum4_1  = fac_4 * div3_4h
      sum4_2  = fac_4 * u_x + usqn_o2

      outState(?SAVE?(qN00,1,iElem,QQ,QQ,nElems,neigh ))=fN00 *cmpl_o-sum4_1 +sum4_2
      outState(?SAVE?(q100,1,iElem,QQ,QQ,nElems,neigh ))=f100 *cmpl_o+sum4_1 +sum4_2

      fac_9   = coeff_2 * u_z
      sum9_1  = fac_9 * div3_4h
      sum9_2  = fac_9 * u_z + usqn_o2

      outState(?SAVE?(q001,1,iElem,QQ,QQ,nElems,neigh ))=f001 *cmpl_o+sum9_1 +sum9_2
      outState(?SAVE?(q00N,1,iElem,QQ,QQ,nElems,neigh ))=f00N *cmpl_o-sum9_1 +sum9_2

    enddo nodeloop
!$omp end do nowait
?? END IF

  end subroutine bgk_advRel_d3q19_incomp
! ****************************************************************************** !


! ****************************************************************************** !
  !> Advection relaxation routine for the D3Q19 model with TRT collision
  !! operator
  !! In TRT, there are two relaxation parameters one can choose.
  !! They have a relationship, which is so-called magic number:
  !! Lambda = ( 1/omegaP - 1/2 ) * ( 1/omegaN - 1/2 )
  !! Different value of Lambda results different error:
  !! Lambda = 1/4 is the best stability for the LBE. As well, this number gives
  !! the solution for the steady-state case dependant only on the equilibirium
  !! funciton.
  !! Lambda = 1/12 removes the third-order advection error
  !! Lambda = 1/6 removes fourth-order diffusion errors
  !! Lambda = 3/16 gives exact location of bounce-back walls for the Poiseuille
  !! flow.
  !! omegaP is usually fixed by viscosity, another one is fixed through the
  !! above magic number combination.
  !!
  !! This subroutine interface must match the abstract interface definition
  !! [[kernel]] in scheme/[[mus_scheme_type_module]].f90 in order to be callable
  !! via [[mus_scheme_type:compute]] function pointer.
  subroutine trt_advRel_d3q19( fieldProp, inState, outState, auxField, &
    &                          neigh, nElems, nSolve, level, layout,   &
    &                          params, varSys, derVarPos               )
    ! -------------------------------------------------------------------- !
    !> Array of field properties (fluid or species)
    type(mus_field_prop_type), intent(in) :: fieldProp(:)
    !> variable system definition
    type(tem_varSys_type), intent(in) :: varSys
    !> current layout
    type(mus_scheme_layout_type), intent(in) :: layout
    !> number of elements in state Array
    integer, intent(in) :: nElems
    !> input  pdf vector
    real(kind=rk), intent(in)  ::  inState(nElems * varSys%nScalars)
    !> output pdf vector
    real(kind=rk), intent(out) :: outState(nElems * varSys%nScalars)
    !> Auxiliary field computed from pre-collision state
    !! Is updated with correct velocity field for multicomponent models
    real(kind=rk), intent(inout) :: auxField(nElems * varSys%nAuxScalars)
    !> connectivity vector
    integer, intent(in) :: neigh(nElems * layout%fStencil%QQ)
    !> number of elements solved in kernel
    integer, intent(in) :: nSolve
    !> current level
    integer,intent(in) :: level
    !> global parameters
    type(mus_param_type),intent(in) :: params
    !> position of derived quantities in varsys for all fields
    type( mus_derVarPos_type ), intent(in) :: derVarPos(:)
    ! -------------------------------------------------------------------- !
    ! ---------------------------------------------------------------------------
?? IF (SOA) THEN
    integer       :: iElem, minElem, maxElem, ii, iLink
    real(kind=rk) ::      fN00(params%block)
    real(kind=rk) ::      f0N0(params%block)
    real(kind=rk) ::      f00N(params%block)
    real(kind=rk) ::      f100(params%block)
    real(kind=rk) ::      f010(params%block)
    real(kind=rk) ::      f001(params%block)
    real(kind=rk) ::      f0NN(params%block)
    real(kind=rk) ::      f0N1(params%block)
    real(kind=rk) ::      f01N(params%block)
    real(kind=rk) ::      f011(params%block)
    real(kind=rk) ::      fN0N(params%block)
    real(kind=rk) ::      f10N(params%block)
    real(kind=rk) ::      fN01(params%block)
    real(kind=rk) ::      f101(params%block)
    real(kind=rk) ::      fNN0(params%block)
    real(kind=rk) ::      fN10(params%block)
    real(kind=rk) ::      f1N0(params%block)
    real(kind=rk) ::      f110(params%block)
    real(kind=rk) feq_common(1:params%block),&
      &                  rho(1:params%block),&
      &                  u_x(1:params%block),&
      &                  u_y(1:params%block),&
      &                  u_z(1:params%block),&
      &              omega_h(1:params%block),&
      &         asym_omega_h(1:params%block)
    real(kind=rk) :: f000
    real(kind=rk) :: usq
    real(kind=rk) :: asym, sym
    real(kind=rk) :: t1x2,t2x2,fac1,fac2
    real(kind=rk) :: omega, asym_omega
    ! derived constants
    real(kind=rk) :: ui, fac
    real(kind=rk), parameter :: t1x2_0 = 1.d0/18.d0 * 2.d0
    real(kind=rk), parameter :: t2x2_0 = 1.d0/36.d0 * 2.d0
    integer :: dens_pos, vel_pos(3), elemOff
    ! ---------------------------------------------------------------------------
    dens_pos = varSys%method%val(derVarPos(1)%density)%auxField_varPos(1)
    vel_pos = varSys%method%val(derVarPos(1)%velocity)%auxField_varPos(1:3)

!$omp do schedule(static)

    !NEC$ ivdep
    !DIR$ IVDEP
    nodeloop: do minElem = 1, nSolve, params%block

      ii = 0
      maxElem = min( minElem + params%block - 1, nSolve )

?? copy :: dir_NOVEC
      do iElem = minElem, maxElem
        ii = ii + 1

        ! First load all local values into temp array
        fN00(ii) = inState( neigh((qN00-1)*nElems+iElem) )
        f0N0(ii) = inState( neigh((q0N0-1)*nElems+iElem) )
        f00N(ii) = inState( neigh((q00N-1)*nElems+iElem) )
        f100(ii) = inState( neigh((q100-1)*nElems+iElem) )
        f010(ii) = inState( neigh((q010-1)*nElems+iElem) )
        f001(ii) = inState( neigh((q001-1)*nElems+iElem) )
        f0NN(ii) = inState( neigh((q0NN-1)*nElems+iElem) )
        f0N1(ii) = inState( neigh((q0N1-1)*nElems+iElem) )
        f01N(ii) = inState( neigh((q01N-1)*nElems+iElem) )
        f011(ii) = inState( neigh((q011-1)*nElems+iElem) )
        fN0N(ii) = inState( neigh((qN0N-1)*nElems+iElem) )
        f10N(ii) = inState( neigh((q10N-1)*nElems+iElem) )
        fN01(ii) = inState( neigh((qN01-1)*nElems+iElem) )
        f101(ii) = inState( neigh((q101-1)*nElems+iElem) )
        fNN0(ii) = inState( neigh((qNN0-1)*nElems+iElem) )
        fN10(ii) = inState( neigh((qN10-1)*nElems+iElem) )
        f1N0(ii) = inState( neigh((q1N0-1)*nElems+iElem) )
        f110(ii) = inState( neigh((q110-1)*nElems+iElem) )
        f000     = inState( neigh((q000-1)*nElems+iElem) )

        ! element offset for auxField array
        elemOff = (iElem-1)*varSys%nAuxScalars
        ! local density
        rho(ii) = auxField(elemOff + dens_pos)
        ! local x-, y- and z-velocity
        u_x(ii) = auxField(elemOff + vel_pos(1))
        u_y(ii) = auxField(elemOff + vel_pos(2))
        u_z(ii) = auxField(elemOff + vel_pos(3))

        ! square velocity and derived constants
        usq  = u_x(ii)*u_x(ii) + u_y(ii)*u_y(ii) + u_z(ii)*u_z(ii)
        feq_common(ii) = 1.d0 - 1.5d0 * usq

        omega = fieldProp(1)%fluid%viscKine%omLvl(level)%val(iElem)
        omega_h(ii) = 0.5_rk * omega ! half omega

        asym_omega = 1.0_rk / ( 0.5_rk + fieldProp(1)%fluid%lambda &
          &                   / ( 1.0_rk/omega - 0.5_rk ) )
        asym_omega_h(ii) = 0.5_rk * asym_omega  ! half asymmetric omega
        outState( (q000-1)*nElems+iElem ) = &
          &     f000*(1.0_rk-omega)+ omega*div1_3*rho(ii)*feq_common(ii)

      end do ! iElem
      call obscure_setzero(ii)

      ! 110 and NN0 ----------------------------------------------------------------
?? copy :: dir_vector
      do iLink = (q110-1)*nElems+minElem, (q110-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_x(ii) + u_y(ii)
        sym  =      omega_h(ii) * ( f110(ii) + fNN0(ii) - fac2*ui*ui &
          &  - t2x2*feq_common(ii) )
        asym = asym_omega_h(ii) * ( f110(ii) - fNN0(ii) - 3.d0*t2x2*ui )
        outState(iLink) = f110(ii) - sym - asym
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qNN0-1)*nElems+minElem, (qNN0-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_x(ii) + u_y(ii)
        sym  =      omega_h(ii) * ( f110(ii) + fNN0(ii) - fac2*ui*ui &
          &  - t2x2*feq_common(ii) )
        asym = asym_omega_h(ii) * ( f110(ii) - fNN0(ii) - 3.d0*t2x2*ui )
        outState(iLink) = fNN0(ii) - sym + asym
      end do
      call obscure_setzero(ii)
      ! 110 and NN0 ----------------------------------------------------------------

      ! 1N0 and N10 ----------------------------------------------------------------
?? copy :: dir_vector
      do iLink = (q1N0-1)*nElems+minElem, (q1N0-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_x(ii) - u_y(ii)
        sym  =      omega_h(ii) * ( f1N0(ii) + fN10(ii) - fac2*ui*ui &
          &  - t2x2*feq_common(ii))
        asym = asym_omega_h(ii) * ( f1N0(ii) - fN10(ii) - 3.d0*t2x2*ui )
        outState(iLink) = f1N0(ii) - sym - asym
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN10-1)*nElems+minElem, (qN10-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_x(ii) - u_y(ii)
        sym  =      omega_h(ii) * ( f1N0(ii) + fN10(ii) - fac2*ui*ui &
          &  - t2x2*feq_common(ii))
        asym = asym_omega_h(ii) * ( f1N0(ii) - fN10(ii) - 3.d0*t2x2*ui )
        outState(iLink) = fN10(ii) - sym + asym
      end do ! iLink
      call obscure_setzero(ii)
      ! ----------------------------------------------------------------

      ! 101 and N0N ----------------------------------------------------------------
?? copy :: dir_vector
      do iLink = (q101-1)*nElems+minElem, (q101-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_x(ii) + u_z(ii)
        sym  =      omega_h(ii) * ( f101(ii) + fN0N(ii) - fac2*ui*ui &
          &  - t2x2*feq_common(ii) )
        asym = asym_omega_h(ii) * ( f101(ii) - fN0N(ii) - 3.d0*t2x2*ui )
        outState(iLink) = f101(ii) - sym - asym
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN0N-1)*nElems+minElem, (qN0N-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_x(ii) + u_z(ii)
        sym  =      omega_h(ii) * ( f101(ii) + fN0N(ii) - fac2*ui*ui &
          &  - t2x2*feq_common(ii) )
        asym = asym_omega_h(ii) * ( f101(ii) - fN0N(ii) - 3.d0*t2x2*ui )
        outState(iLink) = fN0N(ii) - sym + asym
      end do ! iLink
      call obscure_setzero(ii)
      ! 101 and N0N ----------------------------------------------------------------

      ! 10N and 10N ----------------------------------------------------------------
?? copy :: dir_vector
      do iLink = (q10N-1)*nElems+minElem, (q10N-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_x(ii) - u_z(ii)
        sym  =      omega_h(ii) * ( f10N(ii) + fN01(ii) - fac2*ui*ui &
          &  - t2x2*feq_common(ii) )
        asym = asym_omega_h(ii) * ( f10N(ii) - fN01(ii) - 3.d0*t2x2*ui )
        outState(iLink) = f10N(ii) - sym - asym
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN01-1)*nElems+minElem, (qN01-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_x(ii) - u_z(ii)
        sym  =      omega_h(ii) * ( f10N(ii) + fN01(ii) - fac2*ui*ui &
          &  - t2x2*feq_common(ii) )
        asym = asym_omega_h(ii) * ( f10N(ii) - fN01(ii) - 3.d0*t2x2*ui )
        outState(iLink) = fN01(ii) - sym + asym
      end do ! iLink
      call obscure_setzero(ii)
      ! 10N and 10N ----------------------------------------------------------------

      ! 011 and 0NN ----------------------------------------------------------------
?? copy :: dir_vector
      do iLink = (q011-1)*nElems+minElem, (q011-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_y(ii) + u_z(ii)
        sym  =      omega_h(ii) * ( f011(ii) + f0NN(ii) - fac2*ui*ui &
          &  - t2x2*feq_common(ii) )
        asym = asym_omega_h(ii) * ( f011(ii) - f0NN(ii) - 3.d0*t2x2*ui )
        outState(iLink) = f011(ii) - sym - asym
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q0NN-1)*nElems+minElem, (q0NN-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_y(ii) + u_z(ii)
        sym  =      omega_h(ii) * ( f011(ii) + f0NN(ii) - fac2*ui*ui &
          &  - t2x2*feq_common(ii) )
        asym = asym_omega_h(ii) * ( f011(ii) - f0NN(ii) - 3.d0*t2x2*ui )
        outState(iLink) = f0NN(ii) - sym + asym
      end do ! iLink
      call obscure_setzero(ii)
      ! 011 and 0NN ----------------------------------------------------------------

      ! 01N and 0N1 ----------------------------------------------------------------
?? copy :: dir_vector
      do iLink = (q01N-1)*nElems+minElem, (q01N-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_y(ii) - u_z(ii)
        sym  =      omega_h(ii) * ( f01N(ii) + f0N1(ii) - fac2*ui*ui &
          &  - t2x2*feq_common(ii) )
        asym = asym_omega_h(ii) * ( f01N(ii) - f0N1(ii) - 3.d0*t2x2*ui )
        outState(iLink) = f01N(ii) - sym - asym
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q0N1-1)*nElems+minElem, (q0N1-1)*nElems+maxElem
        ii=ii+1
        t2x2 = t2x2_0 * rho(ii)
        fac2 = t2x2 * t2cs4inv !inv2csq2
        ui   = u_y(ii) - u_z(ii)
        sym  =      omega_h(ii) * ( f01N(ii) + f0N1(ii) - fac2*ui*ui &
          & - t2x2*feq_common(ii) )
        asym = asym_omega_h(ii) * ( f01N(ii) - f0N1(ii) - 3.d0*t2x2*ui )
        outState(iLink) = f0N1(ii) - sym + asym
      end do ! iLink
      call obscure_setzero(ii)
      ! 01N and 0N1 ----------------------------------------------------------------

      ! 100 and N00 ----------------------------------------------------------------
?? copy :: dir_vector
      do iLink = (q100-1)*nElems+minElem, (q100-1)*nElems+maxElem
        ii=ii+1
        t1x2 = t1x2_0 * rho(ii)
        fac1 = t1x2 * t2cs4inv !inv2csq2
        sym  =      omega_h(ii) * ( f100(ii) + fN00(ii) - fac1*u_x(ii)*u_x(ii) &
          &  -t1x2*feq_common(ii))
        asym = asym_omega_h(ii) * ( f100(ii) - fN00(ii) - 3.d0*t1x2*u_x(ii) )
        outState(iLink) = f100(ii) - sym - asym
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (qN00-1)*nElems+minElem, (qN00-1)*nElems+maxElem
        ii=ii+1
        t1x2 = t1x2_0 * rho(ii)
        fac1 = t1x2 * t2cs4inv !inv2csq2
        sym  =      omega_h(ii) * ( f100(ii) + fN00(ii) - fac1*u_x(ii)*u_x(ii) &
          &  -t1x2*feq_common(ii))
        asym = asym_omega_h(ii) * ( f100(ii) - fN00(ii) - 3.d0*t1x2*u_x(ii) )
        outState(iLink) = fN00(ii) - sym + asym
      end do ! iLink
      call obscure_setzero(ii)
      ! 100 and N00 ----------------------------------------------------------------

      ! 010 and 0N0 ----------------------------------------------------------------
?? copy :: dir_vector
      do iLink = (q010-1)*nElems+minElem, (q010-1)*nElems+maxElem
        ii=ii+1
        t1x2 = t1x2_0 * rho(ii)
        fac1 = t1x2 * t2cs4inv !inv2csq2
        sym  =      omega_h(ii) * ( f010(ii) + f0N0(ii) - fac1*u_y(ii)*u_y(ii) &
          &  -t1x2*feq_common(ii))
        asym = asym_omega_h(ii) * ( f010(ii) - f0N0(ii) - 3.d0*t1x2*u_y(ii) )
        outState(iLink) = f010(ii) - sym - asym
      end do ! iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q0N0-1)*nElems+minElem, (q0N0-1)*nElems+maxElem
        ii=ii+1
        t1x2 = t1x2_0 * rho(ii)
        fac1 = t1x2 * t2cs4inv !inv2csq2
        sym  =      omega_h(ii) * ( f010(ii) + f0N0(ii) - fac1*u_y(ii)*u_y(ii) &
          &  -t1x2*feq_common(ii))
        asym = asym_omega_h(ii) * ( f010(ii) - f0N0(ii) - 3.d0*t1x2*u_y(ii) )
        outState(iLink) = f0N0(ii) - sym + asym
      end do ! iLink
      call obscure_setzero(ii)
      ! 010 and 0N0 ----------------------------------------------------------------

      ! 001 and 00N ----------------------------------------------------------------
?? copy :: dir_vector
      do iLink = (q001-1)*nElems+minElem, (q001-1)*nElems+maxElem
        ii=ii+1
        t1x2 = t1x2_0 * rho(ii)
        fac1 = t1x2 * t2cs4inv !inv2csq2
        sym  =      omega_h(ii) * ( f001(ii) + f00N(ii) - fac1*u_z(ii)*u_z(ii) &
          &  -t1x2*feq_common(ii))
        asym = asym_omega_h(ii) * ( f001(ii) - f00N(ii) - 3.d0*t1x2*u_z(ii) )
        outState(iLink) = f001(ii) - sym - asym
      end do !iLink
      call obscure_setzero(ii)

?? copy :: dir_vector
      do iLink = (q00N-1)*nElems+minElem, (q00N-1)*nElems+maxElem
        ii=ii+1
        t1x2 = t1x2_0 * rho(ii)
        fac1 = t1x2 * t2cs4inv !inv2csq2
        sym  =      omega_h(ii) * ( f001(ii) + f00N(ii) - fac1*u_z(ii)*u_z(ii) &
          &  -t1x2*feq_common(ii))
        asym = asym_omega_h(ii) * ( f001(ii) - f00N(ii) - 3.d0*t1x2*u_z(ii) )
        outState(iLink) = f00N(ii) - sym + asym
      end do ! iLink
      call obscure_setzero(ii)
      ! 001 and 00N ----------------------------------------------------------------

    enddo nodeloop
!$omp end do nowait

?? ELSE
    ! ---------------------------------------------------------------------------
    integer :: iElem
?? copy :: pdfTmp19( f )
    real(kind=rk) :: rho
    real(kind=rk) :: u_x, u_y, u_z, usq
    real(kind=rk) :: omega, omega_h, asym_omega, asym_omega_h
    real(kind=rk) :: ui
    real(kind=rk) :: asym, sym, feq_common
    real(kind=rk) :: t1x2,t2x2,fac1,fac2
    real(kind=rk), parameter :: t1x2_0 = 1.d0/18.d0 * 2.d0
    real(kind=rk), parameter :: t2x2_0 = 1.d0/36.d0 * 2.d0
    integer :: dens_pos, vel_pos(3), elemOff
    ! ---------------------------------------------------------------------------
    dens_pos = varSys%method%val(derVarPos(1)%density)%auxField_varPos(1)
    vel_pos = varSys%method%val(derVarPos(1)%velocity)%auxField_varPos(1:3)

!$omp do schedule(static)

?? copy :: dir_novec
    nodeloop: do iElem=1,nSolve
      ! First load all local values into temp array
      fN00 = inState(?FETCH?( qN00, 1, iElem, QQ, QQ, nElems,neigh ))
      f0N0 = inState(?FETCH?( q0N0, 1, iElem, QQ, QQ, nElems,neigh ))
      f00N = inState(?FETCH?( q00N, 1, iElem, QQ, QQ, nElems,neigh ))
      f100 = inState(?FETCH?( q100, 1, iElem, QQ, QQ, nElems,neigh ))
      f010 = inState(?FETCH?( q010, 1, iElem, QQ, QQ, nElems,neigh ))
      f001 = inState(?FETCH?( q001, 1, iElem, QQ, QQ, nElems,neigh ))
      f0NN = inState(?FETCH?( q0NN, 1, iElem, QQ, QQ, nElems,neigh ))
      f0N1 = inState(?FETCH?( q0N1, 1, iElem, QQ, QQ, nElems,neigh ))
      f01N = inState(?FETCH?( q01N, 1, iElem, QQ, QQ, nElems,neigh ))
      f011 = inState(?FETCH?( q011, 1, iElem, QQ, QQ, nElems,neigh ))
      fN0N = inState(?FETCH?( qN0N, 1, iElem, QQ, QQ, nElems,neigh ))
      f10N = inState(?FETCH?( q10N, 1, iElem, QQ, QQ, nElems,neigh ))
      fN01 = inState(?FETCH?( qN01, 1, iElem, QQ, QQ, nElems,neigh ))
      f101 = inState(?FETCH?( q101, 1, iElem, QQ, QQ, nElems,neigh ))
      fNN0 = inState(?FETCH?( qNN0, 1, iElem, QQ, QQ, nElems,neigh ))
      fN10 = inState(?FETCH?( qN10, 1, iElem, QQ, QQ, nElems,neigh ))
      f1N0 = inState(?FETCH?( q1N0, 1, iElem, QQ, QQ, nElems,neigh ))
      f110 = inState(?FETCH?( q110, 1, iElem, QQ, QQ, nElems,neigh ))
      f000 = inState(?FETCH?( q000, 1, iElem, QQ, QQ, nElems,neigh ))

      ! element offset for auxField array
      elemOff = (iElem-1)*varSys%nAuxScalars
      ! local density
      rho = auxField(elemOff + dens_pos)
      ! local x-, y- and z-velocity
      u_x = auxField(elemOff + vel_pos(1))
      u_y = auxField(elemOff + vel_pos(2))
      u_z = auxField(elemOff + vel_pos(3))

      ! square velocity and derived constants
      usq  = u_x*u_x + u_y*u_y + u_z*u_z
      feq_common = 1.d0 - 1.5d0 * usq

      omega = fieldProp(1)%fluid%viscKine%omLvl(level)%val(iElem)
      omega_h = 0.5_rk * omega ! half omega

      asym_omega = 1.0_rk / ( 0.5_rk + fieldProp(1)%fluid%lambda &
        &                   / ( 1.0_rk/omega - 0.5_rk )          )
      asym_omega_h = 0.5_rk * asym_omega  ! half asymmetric omega

      ! let's start the relaxation process
      outstate(?SAVE?(q000, 1, iElem, QQ, QQ, nElems,neigh)) =                 &
        &                      f000*(1.0_rk-omega) + omega*div1_3*rho*feq_common

      t2x2 = t2x2_0 * rho
      fac2 = t2x2 * t2cs4inv !inv2csq2

      ui   = u_x + u_y
      sym  =      omega_h * ( f110 + fNN0 - fac2*ui*ui -t2x2*feq_common )
      asym = asym_omega_h * ( f110 - fNN0 - 3.d0*t2x2*ui )
      outstate(?SAVE?(q110, 1, iElem, QQ, QQ, nElems,neigh)) = f110 - sym - asym
      outstate(?SAVE?(qNN0, 1, iElem, QQ, QQ, nElems,neigh)) = fNN0 - sym + asym

      ui   = u_x - u_y
      sym  =      omega_h * ( f1N0 + fN10 - fac2*ui*ui -t2x2*feq_common)
      asym = asym_omega_h * ( f1N0 - fN10 - 3.d0*t2x2*ui )
      outstate(?SAVE?(q1N0, 1, iElem, QQ, QQ, nElems,neigh)) = f1N0 - sym - asym
      outstate(?SAVE?(qN10, 1, iElem, QQ, QQ, nElems,neigh)) = fN10 - sym + asym

      ui   = u_x + u_z
      sym  =      omega_h * ( f101 + fN0N - fac2*ui*ui -t2x2*feq_common)
      asym = asym_omega_h * ( f101 - fN0N - 3.d0*t2x2*ui )
      outstate(?SAVE?(q101, 1, iElem, QQ, QQ, nElems,neigh)) = f101 - sym - asym
      outstate(?SAVE?(qN0N, 1, iElem, QQ, QQ, nElems,neigh)) = fN0N - sym + asym

      ui   = u_x - u_z
      sym  =      omega_h * ( f10N + fN01 - fac2*ui*ui -t2x2*feq_common)
      asym = asym_omega_h * ( f10N - fN01 - 3.d0*t2x2*ui )
      outstate(?SAVE?(q10N, 1, iElem, QQ, QQ, nElems,neigh)) = f10N - sym - asym
      outstate(?SAVE?(qN01, 1, iElem, QQ, QQ, nElems,neigh)) = fN01 - sym + asym

      ui   = u_y + u_z
      sym  =      omega_h * ( f011 + f0NN - fac2*ui*ui -t2x2*feq_common)
      asym = asym_omega_h * ( f011 - f0NN - 3.d0*t2x2*ui )
      outstate(?SAVE?(q011, 1, iElem, QQ, QQ, nElems,neigh)) = f011 - sym - asym
      outstate(?SAVE?(q0NN, 1, iElem, QQ, QQ, nElems,neigh)) = f0NN - sym + asym

      ui   = u_y - u_z
      sym  =      omega_h * ( f01N + f0N1 - fac2*ui*ui -t2x2*feq_common)
      asym = asym_omega_h * ( f01N - f0N1 - 3.d0*t2x2*ui )
      outstate(?SAVE?(q01N, 1, iElem, QQ, QQ, nElems,neigh)) = f01N - sym - asym
      outstate(?SAVE?(q0N1, 1, iElem, QQ, QQ, nElems,neigh)) = f0N1 - sym + asym

      t1x2 = t1x2_0 * rho
      fac1 = t1x2 * t2cs4inv !inv2csq2

      sym  =      omega_h * ( f100 + fN00 - fac1*u_x*u_x-t1x2*feq_common)
      asym = asym_omega_h * ( f100 - fN00 - 3.d0*t1x2*u_x )
      outstate(?SAVE?(q100, 1, iElem, QQ, QQ, nElems,neigh)) = f100 - sym - asym
      outstate(?SAVE?(qN00, 1, iElem, QQ, QQ, nElems,neigh)) = fN00 - sym + asym

      sym  =      omega_h * ( f010 + f0N0 - fac1*u_y*u_y-t1x2*feq_common)
      asym = asym_omega_h * ( f010 - f0N0 - 3.d0*t1x2*u_y )
      outstate(?SAVE?(q010, 1, iElem, QQ, QQ, nElems,neigh)) = f010 - sym - asym
      outstate(?SAVE?(q0N0, 1, iElem, QQ, QQ, nElems,neigh)) = f0N0 - sym + asym

      sym  =      omega_h * ( f001 + f00N - fac1*u_z*u_z-t1x2*feq_common)
      asym = asym_omega_h * ( f001 - f00N - 3.d0*t1x2*u_z )
      outstate(?SAVE?(q001, 1, iElem, QQ, QQ, nElems,neigh)) = f001 - sym - asym
      outstate(?SAVE?(q00N, 1, iElem, QQ, QQ, nElems,neigh)) = f00N - sym + asym
    enddo nodeloop
!$omp end do

?? ENDIF

  end subroutine trt_advRel_d3q19
! ****************************************************************************** !


! ****************************************************************************** !
  !> Advection relaxation routine for the D3Q19 model with TRT collision
  !! operator.
  !!
  !! Incompressible model
  !!
  !! This subroutine interface must match the abstract interface definition
  !! [[kernel]] in scheme/[[mus_scheme_type_module]].f90 in order to be callable
  !! via [[mus_scheme_type:compute]] function pointer.
  subroutine trt_advRel_d3q19_incomp( fieldProp, inState, outState, auxField, &
    &                                 neigh, nElems, nSolve, level, layout,   &
    &                                 params, varSys, derVarPos               )
    ! -------------------------------------------------------------------- !
    !> Array of field properties (fluid or species)
    type(mus_field_prop_type), intent(in) :: fieldProp(:)
    !> variable system definition
    type(tem_varSys_type), intent(in) :: varSys
    !> current layout
    type(mus_scheme_layout_type), intent(in) :: layout
    !> number of elements in state Array
    integer, intent(in) :: nElems
    !> input  pdf vector
    real(kind=rk), intent(in)  ::  inState(nElems * varSys%nScalars)
    !> output pdf vector
    real(kind=rk), intent(out) :: outState(nElems * varSys%nScalars)
    !> Auxiliary field computed from pre-collision state
    !! Is updated with correct velocity field for multicomponent models
    real(kind=rk), intent(inout) :: auxField(nElems * varSys%nAuxScalars)
    !> connectivity vector
    integer, intent(in) :: neigh(nElems * layout%fStencil%QQ)
    !> number of elements solved in kernel
    integer, intent(in) :: nSolve
    !> current level
    integer,intent(in) :: level
    !> global parameters
    type(mus_param_type),intent(in) :: params
    !> position of derived quantities in varsys for all fields
    type( mus_derVarPos_type ), intent(in) :: derVarPos(:)
    ! -------------------------------------------------------------------- !
    integer :: iElem ! element counter
?? copy :: pdfTmp19( f )
    real(kind=rk) :: rho, u_x, u_y, u_z, usq
    real(kind=rk) :: omega, omega_h, asym_omega, asym_omega_h
    real(kind=rk) :: ui
    real(kind=rk) :: asym, sym, feq_common, t1_feq, t2_feq
    real(kind=rk), parameter :: t1x2 = 1.d0/ 9.d0
    real(kind=rk), parameter :: t2x2 = 1.d0/18.d0
    real(kind=rk), parameter :: fac1 = t1x2*t2cs4inv !inv2csq2
    real(kind=rk), parameter :: fac2 = t2x2*t2cs4inv !inv2csq2
    integer :: dens_pos, vel_pos(3), elemOff
    ! ---------------------------------------------------------------------------
    dens_pos = varSys%method%val(derVarPos(1)%density)%auxField_varPos(1)
    vel_pos = varSys%method%val(derVarPos(1)%velocity)%auxField_varPos(1:3)

!$omp do schedule(static)

?? copy :: dir_novec
    nodeloop: do iElem = 1, nSolve
      ! First load all local values into temp array
      fN00 = inState(?FETCH?( qN00, 1, iElem, QQ, QQ, nElems,neigh ))
      f0N0 = inState(?FETCH?( q0N0, 1, iElem, QQ, QQ, nElems,neigh ))
      f00N = inState(?FETCH?( q00N, 1, iElem, QQ, QQ, nElems,neigh ))
      f100 = inState(?FETCH?( q100, 1, iElem, QQ, QQ, nElems,neigh ))
      f010 = inState(?FETCH?( q010, 1, iElem, QQ, QQ, nElems,neigh ))
      f001 = inState(?FETCH?( q001, 1, iElem, QQ, QQ, nElems,neigh ))
      f0NN = inState(?FETCH?( q0NN, 1, iElem, QQ, QQ, nElems,neigh ))
      f0N1 = inState(?FETCH?( q0N1, 1, iElem, QQ, QQ, nElems,neigh ))
      f01N = inState(?FETCH?( q01N, 1, iElem, QQ, QQ, nElems,neigh ))
      f011 = inState(?FETCH?( q011, 1, iElem, QQ, QQ, nElems,neigh ))
      fN0N = inState(?FETCH?( qN0N, 1, iElem, QQ, QQ, nElems,neigh ))
      f10N = inState(?FETCH?( q10N, 1, iElem, QQ, QQ, nElems,neigh ))
      fN01 = inState(?FETCH?( qN01, 1, iElem, QQ, QQ, nElems,neigh ))
      f101 = inState(?FETCH?( q101, 1, iElem, QQ, QQ, nElems,neigh ))
      fNN0 = inState(?FETCH?( qNN0, 1, iElem, QQ, QQ, nElems,neigh ))
      fN10 = inState(?FETCH?( qN10, 1, iElem, QQ, QQ, nElems,neigh ))
      f1N0 = inState(?FETCH?( q1N0, 1, iElem, QQ, QQ, nElems,neigh ))
      f110 = inState(?FETCH?( q110, 1, iElem, QQ, QQ, nElems,neigh ))
      f000 = inState(?FETCH?( q000, 1, iElem, QQ, QQ, nElems,neigh ))

      ! element offset for auxField array
      elemOff = (iElem-1)*varSys%nAuxScalars
      ! local density
      rho = auxField(elemOff + dens_pos)
      ! local x-, y- and z-velocity
      u_x = auxField(elemOff + vel_pos(1))
      u_y = auxField(elemOff + vel_pos(2))
      u_z = auxField(elemOff + vel_pos(3))

      ! square velocity and derived constants
      usq  = u_x*u_x + u_y*u_y + u_z*u_z
      feq_common = rho - 1.5d0 * usq

      omega = fieldProp(1)%fluid%viscKine%omLvl(level)%val(iElem)
      omega_h = 0.5_rk * omega ! half omega

      asym_omega = 1.0_rk / ( 0.5_rk + fieldProp(1)%fluid%lambda &
        &                   / ( 1.0_rk/omega - 0.5_rk )          )
      asym_omega_h = 0.5_rk * asym_omega  ! half asymmetric omega

      ! let's start the relaxation process
      outstate(?SAVE?(q000,1,iElem,QQ,QQ,nElems,neigh)) = f000*(1.d0-omega) &
        & + omega*div1_3*feq_common

      ! t2x2 = 1.d0/18.d0
      t2_feq = t2x2*feq_common

      ui   = u_x + u_y
      sym  =      omega_h*( f110 + fNN0 - fac2*ui*ui-t2_feq)
      asym = asym_omega_h*( f110 - fNN0 - div1_6*ui )
      outstate(?SAVE?(q110,1,iElem,QQ,QQ,nElems,neigh)) = f110 - sym - asym
      outstate(?SAVE?(qNN0,1,iElem,QQ,QQ,nElems,neigh)) = fNN0 - sym + asym

      ui   = u_x - u_y
      sym  =      omega_h*( f1N0 + fN10 - fac2*ui*ui-t2_feq)
      asym = asym_omega_h*( f1N0 - fN10 - div1_6*ui )
      outstate(?SAVE?(q1N0,1,iElem,QQ,QQ,nElems,neigh)) = f1N0 - sym - asym
      outstate(?SAVE?(qN10,1,iElem,QQ,QQ,nElems,neigh)) = fN10 - sym + asym

      ui   = u_x + u_z
      sym  =      omega_h*( f101 + fN0N - fac2*ui*ui-t2_feq)
      asym = asym_omega_h*( f101 - fN0N - div1_6*ui )
      outstate(?SAVE?(q101,1,iElem,QQ,QQ,nElems,neigh)) = f101 - sym - asym
      outstate(?SAVE?(qN0N,1,iElem,QQ,QQ,nElems,neigh)) = fN0N - sym + asym

      ui   = u_x - u_z
      sym  =      omega_h*( f10N + fN01 - fac2*ui*ui-t2_feq)
      asym = asym_omega_h*( f10N - fN01 - div1_6*ui )
      outstate(?SAVE?(q10N,1,iElem,QQ,QQ,nElems,neigh)) = f10N - sym - asym
      outstate(?SAVE?(qN01,1,iElem,QQ,QQ,nElems,neigh)) = fN01 - sym + asym

      ui   = u_y + u_z
      sym  =      omega_h*( f011 + f0NN - fac2*ui*ui-t2_feq)
      asym = asym_omega_h*( f011 - f0NN - div1_6*ui )
      outstate(?SAVE?(q011,1,iElem,QQ,QQ,nElems,neigh)) = f011 - sym - asym
      outstate(?SAVE?(q0NN,1,iElem,QQ,QQ,nElems,neigh)) = f0NN - sym + asym

      ui   = u_y - u_z
      sym  =      omega_h*( f01N + f0N1 - fac2*ui*ui-t2_feq)
      asym = asym_omega_h*( f01N - f0N1 - div1_6*ui )
      outstate(?SAVE?(q01N,1,iElem,QQ,QQ,nElems,neigh)) = f01N - sym - asym
      outstate(?SAVE?(q0N1,1,iElem,QQ,QQ,nElems,neigh)) = f0N1 - sym + asym

      ! t1x2 = 1.d0/ 9.d0
      t1_feq = t1x2*feq_common

      ! ui   = u_y
      sym  =      omega_h*( f010 + f0N0 - fac1*u_y*u_y-t1_feq)
      asym = asym_omega_h*( f010 - f0N0 - div1_3*u_y )
      outstate(?SAVE?(q010,1,iElem,QQ,QQ,nElems,neigh)) = f010 - sym - asym
      outstate(?SAVE?(q0N0,1,iElem,QQ,QQ,nElems,neigh)) = f0N0 - sym + asym

      ! ui   = u_x
      sym  =      omega_h*( f100 + fN00 - fac1*u_x*u_x-t1_feq)
      asym = asym_omega_h*( f100 - fN00 - div1_3*u_x )
      outstate(?SAVE?(q100,1,iElem,QQ,QQ,nElems,neigh)) = f100 - sym - asym
      outstate(?SAVE?(qN00,1,iElem,QQ,QQ,nElems,neigh)) = fN00 - sym + asym

      ! ui   = u_z
      sym  =      omega_h*( f001 + f00N - fac1*u_z*u_z-t1_feq)
      asym = asym_omega_h*( f001 - f00N - div1_3*u_z )
      outstate(?SAVE?(q001,1,iElem,QQ,QQ,nElems,neigh)) = f001 - sym - asym
      outstate(?SAVE?(q00N,1,iElem,QQ,QQ,nElems,neigh)) = f00N - sym + asym
    enddo nodeloop

  end subroutine trt_advRel_d3q19_incomp
! ****************************************************************************** !

?? IF (SOA) THEN
! ****************************************************************************** !
  subroutine obscure_setzero(ii)
    integer :: ii
    ii = 0
  end subroutine obscure_setzero
! ****************************************************************************** !
?? ENDIF

! ****************************************************************************** !
  !> No comment yet!
  !!
  !! TODO add comment
  !!
  !! This subroutine interface must match the abstract interface definition
  !! [[kernel]] in scheme/[[mus_scheme_type_module]].f90 in order to be callable
  !! via [[mus_scheme_type:compute]] function pointer.
  subroutine bgk_advRel_d3q19_block( fieldProp, inState, outState, auxField, &
    &                                neigh, nElems, nSolve, level, layout,   &
    &                                params, varSys, derVarPos               )
    ! -------------------------------------------------------------------- !
    !> Array of field properties (fluid or species)
    type(mus_field_prop_type), intent(in) :: fieldProp(:)
    !> variable system definition
    type(tem_varSys_type), intent(in) :: varSys
    !> current layout
    type(mus_scheme_layout_type), intent(in) :: layout
    !> number of elements in state Array
    integer, intent(in) :: nElems
    !> input  pdf vector
    real(kind=rk), intent(in)  ::  inState(nElems * varSys%nScalars)
    !> output pdf vector
    real(kind=rk), intent(out) :: outState(nElems * varSys%nScalars)
    !> Auxiliary field computed from pre-collision state
    !! Is updated with correct velocity field for multicomponent models
    real(kind=rk), intent(inout) :: auxField(nElems * varSys%nAuxScalars)
    !> connectivity vector
    integer, intent(in) :: neigh(nElems * layout%fStencil%QQ)
    !> number of elements solved in kernel
    integer, intent(in) :: nSolve
    !> current level
    integer,intent(in) :: level
    !> global parameters
    type(mus_param_type),intent(in) :: params
    !> position of derived quantities in varsys for all fields
    type( mus_derVarPos_type ), intent(in) :: derVarPos(:)
    ! -------------------------------------------------------------------- !
    integer :: iElem, nScalars, minElem, maxElem
?? copy :: pdfTmp19( f )
    real(kind=rk) :: rho     ! local density
    real(kind=rk) :: u_x     ! local x-velocity
    real(kind=rk) :: u_y     ! local y-velocity
    real(kind=rk) :: u_z     ! local z-velocity
    real(kind=rk) :: usq     ! square velocity
    ! derived constants
    real(kind=rk) :: usqn, usqn_o1, usqn_o2
    real(kind=rk) :: cmpl_o, omega
    real(kind=rk) :: coeff_1, coeff_2
    real(kind=rk) :: ui1, ui3, ui10, ui11, ui12, ui13
    real(kind=rk) :: fac_1, fac_2, fac_3, fac_4, fac_9, fac_10, fac_11, fac_12,&
      &              fac_13
    real(kind=rk) :: sum1_1, sum1_2, sum2_1, sum2_2, sum3_1, sum3_2, sum4_1,   &
      &              sum4_2, sum9_1, sum9_2, sum10_1, sum10_2, sum11_1,        &
      &              sum11_2, sum12_1, sum12_2, sum13_1, sum13_2
    integer :: dens_pos, vel_pos(3), elemOff
    ! ---------------------------------------------------------------------------
    dens_pos = varSys%method%val(derVarPos(1)%density)%auxField_varPos(1)
    vel_pos = varSys%method%val(derVarPos(1)%velocity)%auxField_varPos(1:3)

    nScalars = varSys%nScalars

!$omp do schedule(static)
    do minElem = 1, nSolve, vlen

      maxElem = min( minElem + vlen - 1, nSolve )

      !NEC$ ivdep
      !NEC$ shortloop
      do iElem = minElem, maxElem
        ! First load all local values into temp array
        fN00 = inState(?FETCH?( qN00, 1, iElem, QQ, nScalars, nElems,neigh ))
        f0N0 = inState(?FETCH?( q0N0, 1, iElem, QQ, nScalars, nElems,neigh ))
        f00N = inState(?FETCH?( q00N, 1, iElem, QQ, nScalars, nElems,neigh ))
        f100 = inState(?FETCH?( q100, 1, iElem, QQ, nScalars, nElems,neigh ))
        f010 = inState(?FETCH?( q010, 1, iElem, QQ, nScalars, nElems,neigh ))
        f001 = inState(?FETCH?( q001, 1, iElem, QQ, nScalars, nElems,neigh ))
        f0NN = inState(?FETCH?( q0NN, 1, iElem, QQ, nScalars, nElems,neigh ))
        f0N1 = inState(?FETCH?( q0N1, 1, iElem, QQ, nScalars, nElems,neigh ))
        f01N = inState(?FETCH?( q01N, 1, iElem, QQ, nScalars, nElems,neigh ))
        f011 = inState(?FETCH?( q011, 1, iElem, QQ, nScalars, nElems,neigh ))
        fN0N = inState(?FETCH?( qN0N, 1, iElem, QQ, nScalars, nElems,neigh ))
        f10N = inState(?FETCH?( q10N, 1, iElem, QQ, nScalars, nElems,neigh ))
        fN01 = inState(?FETCH?( qN01, 1, iElem, QQ, nScalars, nElems,neigh ))
        f101 = inState(?FETCH?( q101, 1, iElem, QQ, nScalars, nElems,neigh ))
        fNN0 = inState(?FETCH?( qNN0, 1, iElem, QQ, nScalars, nElems,neigh ))
        fN10 = inState(?FETCH?( qN10, 1, iElem, QQ, nScalars, nElems,neigh ))
        f1N0 = inState(?FETCH?( q1N0, 1, iElem, QQ, nScalars, nElems,neigh ))
        f110 = inState(?FETCH?( q110, 1, iElem, QQ, nScalars, nElems,neigh ))
        f000 = inState(?FETCH?( q000, 1, iElem, QQ, nScalars, nElems,neigh ))

        ! element offset for auxField array
        elemOff = (iElem-1)*varSys%nAuxScalars
        ! local density
        rho = auxField(elemOff + dens_pos)
        ! local x-, y- and z-velocity
        u_x = auxField(elemOff + vel_pos(1))
        u_y = auxField(elemOff + vel_pos(2))
        u_z = auxField(elemOff + vel_pos(3))

        ! square velocity and derived constants
        usq  = u_x*u_x + u_y*u_y + u_z*u_z
        usqn = div1_36 * (1._rk - 1.5_rk * usq) * rho

        ! read the relaxation parameter omega for the current level
        omega   = fieldProp(1)%fluid%viscKine%omLvl(level)%val(iElem)
        ! pre-calculate partial collision constants
        cmpl_o  = 1._rk - omega

        ! f = (1-w) * f + w * fEq
        outState(?SAVE?(q000,1, iElem, QQ, nScalars,nElems,neigh)) =    &
          &                     f000*cmpl_o+omega*rho*(div1_3-0.5_rk*usq)

        coeff_1 = div1_8 * omega * rho

        usqn_o1 = omega * usqn

        ui1     =  u_x + u_y
        fac_1   = coeff_1 * ui1
        sum1_1  = fac_1 * div3_4h
        sum1_2  = fac_1 * ui1 + usqn_o1

        outState(?SAVE?(q110,1, iElem, QQ, nScalars,nElems,neigh)) =    &
          &                                   f110 *cmpl_o+sum1_1 +sum1_2
        outState(?SAVE?(qNN0,1, iElem, QQ, nScalars,nElems,neigh)) =    &
          &                                   fNN0 *cmpl_o-sum1_1 +sum1_2

        ui3     = -u_x + u_y
        fac_3   = coeff_1 * ui3
        sum3_1  = fac_3 * div3_4h
        sum3_2  = fac_3 * ui3 + usqn_o1

        outState(?SAVE?(qN10,1, iElem, QQ, nScalars,nElems,neigh)) =    &
          &                                   fN10 *cmpl_o+sum3_1 +sum3_2
        outState(?SAVE?(q1N0,1, iElem, QQ, nScalars,nElems,neigh)) =    &
          &                                   f1N0 *cmpl_o-sum3_1 +sum3_2

        ui10    =  u_x + u_z
        fac_10  = coeff_1 * ui10
        sum10_1 = fac_10 * div3_4h
        sum10_2 = fac_10 * ui10 + usqn_o1

        outState(?SAVE?(q101,1, iElem, QQ, nScalars,nElems,neigh)) =    &
          &                                   f101*cmpl_o+sum10_1+sum10_2
        outState(?SAVE?(qN0N,1, iElem, QQ, nScalars,nElems,neigh)) =    &
          &                                   fN0N*cmpl_o-sum10_1+sum10_2

        ui12    = -u_x + u_z
        fac_12  = coeff_1 * ui12
        sum12_1 = fac_12 * div3_4h
        sum12_2 = fac_12 * ui12 + usqn_o1

        outState(?SAVE?(qN01,1, iElem, QQ, nScalars,nElems,neigh)) =    &
          &                                   fN01*cmpl_o+sum12_1+sum12_2
        outState(?SAVE?(q10N,1, iElem, QQ, nScalars,nElems,neigh)) =    &
          &                                   f10N*cmpl_o-sum12_1+sum12_2

        ui11    =  u_y + u_z
        fac_11  = coeff_1 * ui11
        sum11_1 = fac_11 * div3_4h
        sum11_2 = fac_11 * ui11 + usqn_o1

        outState(?SAVE?(q011,1, iElem, QQ, nScalars,nElems,neigh)) =    &
          &                                   f011*cmpl_o+sum11_1+sum11_2
        outState(?SAVE?(q0NN,1, iElem, QQ, nScalars,nElems,neigh)) =    &
          &                                   f0NN*cmpl_o-sum11_1+sum11_2

        ui13    = -u_y + u_z
        fac_13  = coeff_1 * ui13
        sum13_1 = fac_13 * div3_4h
        sum13_2 = fac_13 * ui13 + usqn_o1

        outState(?SAVE?(q0N1,1, iElem, QQ, nScalars,nElems,neigh)) =    &
          &                                   f0N1*cmpl_o+sum13_1+sum13_2
        outState(?SAVE?(q01N,1, iElem, QQ, nScalars,nElems,neigh)) =    &
          &                                   f01N*cmpl_o-sum13_1+sum13_2

        coeff_2 = div1_8 * omega * 2.0_rk * rho
        usqn_o2 = omega * 2.0_rk * usqn

        fac_2   = coeff_2 * u_y
        sum2_1  = fac_2 * div3_4h
        sum2_2  = fac_2 * u_y + usqn_o2

        outState(?SAVE?(q010,1, iElem, QQ, nScalars,nElems,neigh )) =   &
          &                                   f010 *cmpl_o+sum2_1 +sum2_2
        outState(?SAVE?(q0N0,1, iElem, QQ, nScalars,nElems,neigh )) =   &
          &                                   f0N0 *cmpl_o-sum2_1 +sum2_2

        fac_4   = coeff_2 * u_x
        sum4_1  = fac_4 * div3_4h
        sum4_2  = fac_4 * u_x + usqn_o2

        outState(?SAVE?(qN00,1, iElem, QQ, nScalars,nElems,neigh )) =   &
          &                                   fN00 *cmpl_o-sum4_1 +sum4_2
        outState(?SAVE?(q100,1, iElem, QQ, nScalars,nElems,neigh )) =   &
          &                                   f100 *cmpl_o+sum4_1 +sum4_2

        fac_9   = coeff_2 * u_z
        sum9_1  = fac_9 * div3_4h
        sum9_2  = fac_9 * u_z + usqn_o2

        outState(?SAVE?(q001,1, iElem, QQ, nScalars,nElems,neigh )) =   &
          &                                   f001 *cmpl_o+sum9_1 +sum9_2
        outState(?SAVE?(q00N,1, iElem, QQ, nScalars,nElems,neigh )) =   &
          &                                   f00N *cmpl_o-sum9_1 +sum9_2

      end do
    end do
!$omp end do nowait

  end subroutine bgk_advRel_d3q19_block
! ****************************************************************************** !

end module mus_d3q19_module
! ****************************************************************************** !
