Commit af6de788 authored by Gaurav Kukreja

Overlap communication and computation using MPI_Isend and MPI_Irecv
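The idea is to post the ghost-layer exchange with non-blocking calls, update the inner block while the messages are in flight, and only then wait and update the border cells. A minimal self-contained sketch of that pattern (ranks, buffer sizes and tags below are illustrative and not taken from the SWE code):

#include <mpi.h>
#include <vector>

int main( int argc, char** argv ) {
  MPI_Init( &argc, &argv );

  int l_rank, l_size;
  MPI_Comm_rank( MPI_COMM_WORLD, &l_rank );
  MPI_Comm_size( MPI_COMM_WORLD, &l_size );

  const int l_n = 1024;
  std::vector<float> l_sendGhost( l_n, (float) l_rank ); // layer the neighbor reads
  std::vector<float> l_recvGhost( l_n, 0.f );            // ghost layer the neighbor writes
  std::vector<float> l_inner( l_n, 1.f );                // inner-block data

  const int l_right = ( l_rank + 1 ) % l_size;
  const int l_left  = ( l_rank - 1 + l_size ) % l_size;

  // 1. post the ghost-layer exchange without blocking
  MPI_Request l_requests[2];
  MPI_Isend( l_sendGhost.data(), l_n, MPI_FLOAT, l_right, 1, MPI_COMM_WORLD, &l_requests[0] );
  MPI_Irecv( l_recvGhost.data(), l_n, MPI_FLOAT, l_left,  1, MPI_COMM_WORLD, &l_requests[1] );

  // 2. compute on the inner block while the messages are in flight
  for ( int i = 1; i < l_n - 1; i++ )
    l_inner[i] *= 2.f;

  // 3. wait for the transfers, then update the cells that need the ghost layer
  MPI_Waitall( 2, l_requests, MPI_STATUSES_IGNORE );
  l_inner[0]       += l_recvGhost[0];
  l_inner[l_n - 1] += l_recvGhost[l_n - 1];

  MPI_Finalize();
  return 0;
}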

Signed-off-by: Gaurav Kukreja <gmkukreja@gmail.com>
parent dfbe3c4e
@@ -380,6 +380,434 @@ SWE_WavePropagationBlock::computeNumericalFluxes ()
#endif
}
/**
* Compute net updates for the block.
* The member variable #maxTimestep will be updated with the
* maximum allowed time step size
*/
void
SWE_WavePropagationBlock::computeNumericalFluxes_innerBlock ()
{
#ifdef COUNTFLOPS
#ifdef LOOP_OPENMP
const double time_begin = omp_get_wtime();
#else
const double time_begin = clock();
#endif
#endif
//maximum (linearized) wave speed within one iteration
float maxWaveSpeed = (float) 0.;
// compute the loop limits
// const int end_ny_1_1 = ny + 1;
const int end_ny_1_1 = ny;
// const int end_ny_1_2 = ny + 2;
const int end_ny_1_2 = ny + 1;
/***************************************************************************************
* compute the net-updates for the vertical edges
**************************************************************************************/
#ifdef LOOP_OPENMP
#pragma omp parallel
#endif // LOOP_OPENMP
{
float l_maxWaveSpeed = (float) 0.;
#if WAVE_PROPAGATION_SOLVER==4
solver::FWaveVec<float> wavePropagationSolver;
#else // WAVE_PROPAGATION_SOLVER==4
solver::AugRie_SIMD wavePropagationSolver;
#endif // WAVE_PROPAGATION_SOLVER==4
#ifdef LOOP_OPENMP
// Use OpenMP for the outer loop
#pragma omp for schedule(static) nowait
#endif // LOOP_OPENMP
for (int i = 2; i < nx; i++) {
int j;
#if WAVE_PROPAGATION_SOLVER==4 and defined VECTORIZE
// Vectorization is currently only possible for the FWaveVec solver
// Vectorize the inner loop
#pragma simd
#endif // WAVE_PROPAGATION_SOLVER==4 and defined VECTORIZE
for (j = 2; j < end_ny_1_1; ++j) {
float maxEdgeSpeed;
wavePropagationSolver.computeNetUpdates (
h[i - 1][j], h[i][j],
hu[i - 1][j], hu[i][j],
b[i - 1][j], b[i][j],
hNetUpdatesLeft[i - 1][j - 1], hNetUpdatesRight[i - 1][j - 1],
huNetUpdatesLeft[i - 1][j - 1], huNetUpdatesRight[i - 1][j - 1],
maxEdgeSpeed
);
#ifdef LOOP_OPENMP
//update the thread-local maximum wave speed
l_maxWaveSpeed = std::max (l_maxWaveSpeed, maxEdgeSpeed);
#else // LOOP_OPENMP
//update the maximum wave speed
maxWaveSpeed = std::max (maxWaveSpeed, maxEdgeSpeed);
#endif // LOOP_OPENMP
}
assert (j == end_ny_1_1);
// }
/***************************************************************************************
* compute the net-updates for the horizontal edges
**************************************************************************************/
#if WAVE_PROPAGATION_SOLVER==4 and defined VECTORIZE
// Vectorization is currently only possible for the FWaveVec solver
// Vectorize the inner loop
#pragma simd
#endif // WAVE_PROPAGATION_SOLVER==4
for (j = 2; j < end_ny_1_2; j++) {
float maxEdgeSpeed;
wavePropagationSolver.computeNetUpdates (
h[i][j - 1], h[i][j],
hv[i][j - 1], hv[i][j],
b[i][j - 1], b[i][j],
hNetUpdatesBelow[i - 1][j - 1], hNetUpdatesAbove[i - 1][j - 1],
hvNetUpdatesBelow[i - 1][j - 1], hvNetUpdatesAbove[i - 1][j - 1],
maxEdgeSpeed
);
#ifdef LOOP_OPENMP
//update the thread-local maximum wave speed
l_maxWaveSpeed = std::max (l_maxWaveSpeed, maxEdgeSpeed);
#else // LOOP_OPENMP
//update the maximum wave speed
maxWaveSpeed = std::max (maxWaveSpeed, maxEdgeSpeed);
#endif // LOOP_OPENMP
}
assert (j == end_ny_1_2);
}
// vvvvv GKUKREJA : The loops for the horizontal and vertical edges are fused above; this extra block handles one additional column of vertical edges (i = nx + 1) so that no conditional is needed inside the fused loop.
{
int i = nx + 1;
int j = 1;
#ifdef LOOP_OPENMP
#pragma omp for schedule(static) nowait
#endif // LOOP_OPENMP
#if WAVE_PROPAGATION_SOLVER==4 and defined VECTORIZE
// Vectorization is currently only possible for the FWaveVec solver
// Vectorize the inner loop
#pragma simd
#endif // WAVE_PROPAGATION_SOLVER==4 and defined VECTORIZE
for (j = 1; j < end_ny_1_1; ++j) {
float maxEdgeSpeed;
wavePropagationSolver.computeNetUpdates (
h[i - 1][j], h[i][j],
hu[i - 1][j], hu[i][j],
b[i - 1][j], b[i][j],
hNetUpdatesLeft[i - 1][j - 1], hNetUpdatesRight[i - 1][j - 1],
huNetUpdatesLeft[i - 1][j - 1], huNetUpdatesRight[i - 1][j - 1],
maxEdgeSpeed
);
#ifdef LOOP_OPENMP
//update the thread-local maximum wave speed
l_maxWaveSpeed = std::max (l_maxWaveSpeed, maxEdgeSpeed);
#else // LOOP_OPENMP
//update the maximum wave speed
maxWaveSpeed = std::max (maxWaveSpeed, maxEdgeSpeed);
#endif // LOOP_OPENMP
}
assert (j == end_ny_1_1);
}
// ^^^^^ GKUKREJA : end of the extra block for the fused horizontal/vertical edge loops.
#ifdef LOOP_OPENMP
#pragma omp critical
{
maxWaveSpeed = std::max (l_maxWaveSpeed, maxWaveSpeed);
#ifdef COUNTFLOPS
flops += wavePropagationSolver.flops;
#endif
}
#endif // LOOP_OPENMP
} // end of (parallel) block
if (maxWaveSpeed > 0.00001) {
//TODO zeroTol
//compute the time step width
//CFL-Condition
//(max. wave speed) * dt / dx < .5
// => dt = .5 * dx/(max wave speed)
maxTimestep = std::min (dx / maxWaveSpeed, dy / maxWaveSpeed);
maxTimestep *= (float) .4; //CFL number of .4, slightly more conservative than the .5 in the derivation above
} else {
//might happen in dry cells
maxTimestep = std::numeric_limits<float>::max ();
}
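// Worked example with illustrative values (not taken from the code): for
// dx = dy = 100 m and maxWaveSpeed = 20 m/s, min(dx, dy) / maxWaveSpeed = 5 s,
// and the factor .4 gives maxTimestep = 2 s.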
#ifdef COUNTFLOPS
#ifdef LOOP_OPENMP
time_needed += omp_get_wtime() - time_begin;
#else
time_needed += clock() - time_begin;
#endif
#endif
}
/**
* Compute net updates for the block.
* The member variable #maxTimestep will be updated with the
* maximum allowed time step size
*/
void
SWE_WavePropagationBlock::computeNumericalFluxes_borders ()
{
#ifdef COUNTFLOPS
#ifdef LOOP_OPENMP
const double time_begin = omp_get_wtime();
#else
const double time_begin = clock();
#endif
#endif
//maximum (linearized) wave speed within one iteration
float maxWaveSpeed = (float) 0.;
// compute the loop limits
const int end_ny_1_1 = ny + 1;
const int end_ny_1_2 = ny + 2;
#ifdef LOOP_OPENMP
#pragma omp parallel
#endif
{
float l_maxWaveSpeed = (float) 0.;
#if WAVE_PROPAGATION_SOLVER==4
solver::FWaveVec<float> wavePropagationSolver;
#else // WAVE_PROPAGATION_SOLVER==4
solver::AugRie_SIMD wavePropagationSolver;
#endif // WAVE_PROPAGATION_SOLVER==4
/***************************************************************************************
* compute the net-updates for the vertical edges
**************************************************************************************/
#ifdef LOOP_OPENMP
// Use OpenMP for the outer loop
#pragma omp for schedule(static) nowait
#endif // LOOP_OPENMP
#if WAVE_PROPAGATION_SOLVER==4 and defined VECTORIZE
// Vectorization is currently only possible for the FWaveVec solver
// Vectorize the inner loop
#pragma simd
#endif // WAVE_PROPAGATION_SOLVER==4 and defined VECTORIZE
// Compute the vertical edges along the left and right borders
for (int j = 1; j < end_ny_1_1; ++j) {
float maxEdgeSpeed;
wavePropagationSolver.computeNetUpdates (
h[0][j], h[1][j],
hu[0][j], hu[1][j],
b[0][j], b[1][j],
hNetUpdatesLeft[0][j - 1], hNetUpdatesRight[0][j - 1],
huNetUpdatesLeft[0][j - 1], huNetUpdatesRight[0][j - 1],
maxEdgeSpeed
);
wavePropagationSolver.computeNetUpdates (
h[nx][j], h[nx + 1][j],
hu[nx][j], hu[nx + 1][j],
b[nx][j], b[nx + 1][j],
hNetUpdatesLeft[nx][j - 1], hNetUpdatesRight[nx][j - 1],
huNetUpdatesLeft[nx][j - 1], huNetUpdatesRight[nx][j - 1],
maxEdgeSpeed
);
#ifdef LOOP_OPENMP
//update the thread-local maximum wave speed
l_maxWaveSpeed = std::max (l_maxWaveSpeed, maxEdgeSpeed);
#else // LOOP_OPENMP
//update the maximum wave speed
maxWaveSpeed = std::max (maxWaveSpeed, maxEdgeSpeed);
#endif // LOOP_OPENMP
}
// assert (j == ny + 1);
#ifdef LOOP_OPENMP
// Use OpenMP for the outer loop
#pragma omp for schedule(static) nowait
#endif // LOOP_OPENMP
#if WAVE_PROPAGATION_SOLVER==4 and defined VECTORIZE
// Vectorization is currently only possible for the FWaveVec solver
// Vectorize the inner loop
#pragma simd
#endif // WAVE_PROPAGATION_SOLVER==4 and defined VECTORIZE
// Compute the vertical edges along the bottom and top borders
for (int i = 1; i < nx + 2; i++)
{
float maxEdgeSpeed;
wavePropagationSolver.computeNetUpdates (
h[i - 1][1], h[i][1],
hu[i - 1][1], hu[i][1],
b[i - 1][1], b[i][1],
hNetUpdatesLeft[i - 1][0], hNetUpdatesRight[i - 1][0],
huNetUpdatesLeft[i - 1][0], huNetUpdatesRight[i - 1][0],
maxEdgeSpeed
);
wavePropagationSolver.computeNetUpdates (
h[i - 1][end_ny_1_1 - 1], h[i][end_ny_1_1 - 1],
hu[i - 1][end_ny_1_1 - 1], hu[i][end_ny_1_1 - 1],
b[i - 1][end_ny_1_1 - 1], b[i][end_ny_1_1 - 1],
hNetUpdatesLeft[i - 1][end_ny_1_1 - 2], hNetUpdatesRight[i - 1][end_ny_1_1 - 2],
huNetUpdatesLeft[i - 1][end_ny_1_1 - 2], huNetUpdatesRight[i - 1][end_ny_1_1 - 2],
maxEdgeSpeed
);
#ifdef LOOP_OPENMP
//update the thread-local maximum wave speed
l_maxWaveSpeed = std::max (l_maxWaveSpeed, maxEdgeSpeed);
#else // LOOP_OPENMP
//update the maximum wave speed
maxWaveSpeed = std::max (maxWaveSpeed, maxEdgeSpeed);
#endif // LOOP_OPENMP
}
/***************************************************************************************
* compute the net-updates for the horizontal edges
**************************************************************************************/
#ifdef LOOP_OPENMP
// Use OpenMP for the outer loop
#pragma omp for schedule(static) nowait
#endif // LOOP_OPENMP
#if WAVE_PROPAGATION_SOLVER==4 and defined VECTORIZE
// Vectorization is currently only possible for the FWaveVec solver
// Vectorize the inner loop
#pragma simd
#endif // WAVE_PROPAGATION_SOLVER==4 and defined VECTORIZE
// Compute the horizontal edges along the left and right borders
for (int j = 1; j < end_ny_1_2; j++) {
float maxEdgeSpeed;
wavePropagationSolver.computeNetUpdates (
h[1][j - 1], h[1][j],
hv[1][j - 1], hv[1][j],
b[1][j - 1], b[1][j],
hNetUpdatesBelow[0][j - 1], hNetUpdatesAbove[0][j - 1],
hvNetUpdatesBelow[0][j - 1], hvNetUpdatesAbove[0][j - 1],
maxEdgeSpeed
);
wavePropagationSolver.computeNetUpdates (
h[nx + 1][j - 1], h[nx + 1][j],
hv[nx + 1][j - 1], hv[nx + 1][j],
b[nx + 1][j - 1], b[nx + 1][j],
hNetUpdatesBelow[nx][j - 1], hNetUpdatesAbove[nx][j - 1],
hvNetUpdatesBelow[nx][j - 1], hvNetUpdatesAbove[nx][j - 1],
maxEdgeSpeed
);
#ifdef LOOP_OPENMP
//update the thread-local maximum wave speed
l_maxWaveSpeed = std::max (l_maxWaveSpeed, maxEdgeSpeed);
#else // LOOP_OPENMP
//update the maximum wave speed
maxWaveSpeed = std::max (maxWaveSpeed, maxEdgeSpeed);
#endif // LOOP_OPENMP
}
#ifdef LOOP_OPENMP
// Use OpenMP for the outer loop
#pragma omp for schedule(static) nowait
#endif // LOOP_OPENMP
#if WAVE_PROPAGATION_SOLVER==4 and defined VECTORIZE
// Vectorization is currently only possible for the FWaveVec solver
// Vectorize the inner loop
#pragma simd
#endif // WAVE_PROPAGATION_SOLVER==4 and defined VECTORIZE
// Compute the horizontal edges along the bottom and top borders
for (int i = 1; i < nx + 2; i++) {
float maxEdgeSpeed;
wavePropagationSolver.computeNetUpdates (
h[i][0], h[i][1],
hv[i][0], hv[i][1],
b[i][0], b[i][1],
hNetUpdatesBelow[i - 1][0], hNetUpdatesAbove[i - 1][0],
hvNetUpdatesBelow[i - 1][0], hvNetUpdatesAbove[i - 1][0],
maxEdgeSpeed
);
wavePropagationSolver.computeNetUpdates (
h[i][end_ny_1_2 - 2], h[i][end_ny_1_2 - 1],
hv[i][end_ny_1_2 - 2], hv[i][end_ny_1_2 - 1],
b[i][end_ny_1_2 - 2], b[i][end_ny_1_2 - 1],
hNetUpdatesBelow[i - 1][end_ny_1_2 - 2], hNetUpdatesAbove[i - 1][end_ny_1_2 - 2],
hvNetUpdatesBelow[i - 1][end_ny_1_2 - 2], hvNetUpdatesAbove[i - 1][end_ny_1_2 - 2],
maxEdgeSpeed
);
#ifdef LOOP_OPENMP
//update the thread-local maximum wave speed
l_maxWaveSpeed = std::max (l_maxWaveSpeed, maxEdgeSpeed);
#else // LOOP_OPENMP
//update the maximum wave speed
maxWaveSpeed = std::max (maxWaveSpeed, maxEdgeSpeed);
#endif // LOOP_OPENMP
}
// assert (j = ny + 2);
#ifdef LOOP_OPENMP
#pragma omp critical
{
maxWaveSpeed = std::max (l_maxWaveSpeed, maxWaveSpeed);
#ifdef COUNTFLOPS
flops += wavePropagationSolver.flops;
#endif
}
#endif // LOOP_OPENMP
} // end of (parallel) block
if (maxWaveSpeed > 0.00001) {
//TODO zeroTol
//compute the time step width
//CFL-Condition
//(max. wave speed) * dt / dx < .5
// => dt = .5 * dx/(max wave speed)
maxTimestep = std::min (dx / maxWaveSpeed, dy / maxWaveSpeed);
maxTimestep *= (float) .4; //CFL number of .4, slightly more conservative than the .5 in the derivation above
} else {
//might happen in dry cells
maxTimestep = std::numeric_limits<float>::max ();
}
#ifdef COUNTFLOPS
#ifdef LOOP_OPENMP
time_needed += omp_get_wtime() - time_begin;
#else
time_needed += clock() - time_begin;
#endif
#endif
}
/**
* Updates the unknowns with the already computed net-updates.
*
@@ -436,15 +436,17 @@ int main( int argc, char** argv ) {
// do time steps until next checkpoint is reached
while( l_t < l_checkPoints[c] ) {
MPI_Request* l_request;
//reset CPU-Communication clock
tools::Logger::logger.resetClockToCurrentTime("CpuCommunication");
// exchange ghost and copy layers
exchangeLeftRightGhostLayers( l_leftNeighborRank, l_leftInflow, l_leftOutflow,
l_request = exchangeAsyncLeftRightGhostLayers( l_leftNeighborRank, l_leftInflow, l_leftOutflow,
l_rightNeighborRank, l_rightInflow, l_rightOutflow,
l_mpiCol );
exchangeBottomTopGhostLayers( l_bottomNeighborRank, l_bottomInflow, l_bottomOutflow,
l_request = exchangeAsyncBottomTopGhostLayers( l_bottomNeighborRank, l_bottomInflow, l_bottomOutflow,
l_topNeighborRank, l_topInflow, l_topOutflow,
l_mpiRow );
@@ -454,8 +456,14 @@ int main( int argc, char** argv ) {
// set values in ghost cells
l_wavePropgationBlock.setGhostLayer();
// compute numerical flux on each edge
l_wavePropgationBlock.computeNumericalFluxes();
// compute numerical fluxes for the inner block
l_wavePropgationBlock.computeNumericalFluxes_innerBlock(); // Only Inner Block
// wait until the asynchronous ghost-layer exchange has completed
MPI_Wait(l_request, MPI_STATUS_IGNORE);
// compute numerical fluxes for the border edges
l_wavePropgationBlock.computeNumericalFluxes_borders(); // Only Borders
//! maximum allowed time step width within a block.
float l_maxTimeStepWidth = l_wavePropgationBlock.getMaxTimestep();
@@ -555,9 +563,16 @@ void exchangeLeftRightGhostLayers( const int i_leftNeighborRank, SWE_Block1D* o
MPI_Status l_status;
// int MPI_Sendrecv(void *sendbuf, int sendcount, MPI_Datatype sendtype,
// int dest, int sendtag,
// void *recvbuf, int recvcount, MPI_Datatype recvtype,
// int source, int recvtag,
// MPI_Comm comm, MPI_Status *status)
// send to left, receive from the right:
MPI_Sendrecv( i_leftOutflow->h.elemVector(), 1, i_mpiCol, i_leftNeighborRank, 1,
o_rightInflow->h.elemVector(), 1, i_mpiCol, i_rightNeighborRank, 1,
MPI_Sendrecv( i_leftOutflow->h.elemVector(), 1, i_mpiCol,
i_leftNeighborRank, 1,
o_rightInflow->h.elemVector(), 1, i_mpiCol,
i_rightNeighborRank, 1,
MPI_COMM_WORLD, &l_status );
MPI_Sendrecv( i_leftOutflow->hu.elemVector(), 1, i_mpiCol, i_leftNeighborRank, 2,
@@ -583,6 +598,92 @@ void exchangeLeftRightGhostLayers( const int i_leftNeighborRank, SWE_Block1D* o
}
/**
* Exchanges the left and right ghost layers asynchronously using MPI's
* non-blocking MPI_Isend/MPI_Irecv.
*
* @param i_leftNeighborRank MPI rank of the left neighbor.
* @param o_leftInflow ghost layer, where the left neighbor writes into.
* @param i_leftOutflow layer, where the left neighbor reads from.
* @param i_rightNeighborRank MPI rank of the right neighbor.
* @param o_rightInflow ghost layer, where the right neighbor writes into.
* @param i_rightOutflow layer, where the right neighbor reads from.
* @param i_mpiCol MPI data type for the vertical ghost layers.
* @return pointer to the MPI request handle of the posted transfers.
*/
MPI_Request* exchangeAsyncLeftRightGhostLayers( const int i_leftNeighborRank, SWE_Block1D* o_leftInflow, SWE_Block1D* i_leftOutflow,
const int i_rightNeighborRank, SWE_Block1D* o_rightInflow, SWE_Block1D* i_rightOutflow,
MPI_Datatype i_mpiCol) {
// MPI_Status l_status;
MPI_Request* l_request;
l_request = (MPI_Request*) malloc(sizeof(MPI_Request));
// int MPI_Sendrecv(void *sendbuf, int sendcount, MPI_Datatype sendtype,
// int dest, int sendtag,
// void *recvbuf, int recvcount, MPI_Datatype recvtype,
// int source, int recvtag,
// MPI_Comm comm, MPI_Status *status)
// send to left, receive from the right:
MPI_Isend ( i_leftOutflow->h.elemVector(), 1, i_mpiCol,
i_leftNeighborRank, 1, MPI_COMM_WORLD, l_request);
MPI_Irecv ( o_rightInflow->h.elemVector(), 1, i_mpiCol,
i_rightNeighborRank, 1, MPI_COMM_WORLD, l_request);
// MPI_Sendrecv( i_leftOutflow->h.elemVector(), 1, i_mpiCol,
// i_leftNeighborRank, 1,
// o_rightInflow->h.elemVector(), 1, i_mpiCol,
// i_rightNeighborRank, 1,
// MPI_COMM_WORLD, &l_status );
MPI_Isend ( i_leftOutflow->hu.elemVector(), 1, i_mpiCol,
i_leftNeighborRank, 2, MPI_COMM_WORLD, l_request);
MPI_Irecv ( o_rightInflow->hu.elemVector(), 1, i_mpiCol,
i_rightNeighborRank, 2, MPI_COMM_WORLD, l_request);
// MPI_Sendrecv( i_leftOutflow->hu.elemVector(), 1, i_mpiCol, i_leftNeighborRank, 2,
// o_rightInflow->hu.elemVector(), 1, i_mpiCol, i_rightNeighborRank, 2,
// MPI_COMM_WORLD, &l_status );
MPI_Isend ( i_leftOutflow->hv.elemVector(), 1, i_mpiCol,
i_leftNeighborRank, 3, MPI_COMM_WORLD, l_request);
MPI_Irecv ( o_rightInflow->hv.elemVector(), 1, i_mpiCol,
i_rightNeighborRank, 3, MPI_COMM_WORLD, l_request);
// MPI_Sendrecv( i_leftOutflow->hv.elemVector(), 1, i_mpiCol, i_leftNeighborRank, 3,
// o_rightInflow->hv.elemVector(), 1, i_mpiCol, i_rightNeighborRank, 3,
// MPI_COMM_WORLD, &l_status );
// send to right, receive from the left:
MPI_Isend ( i_rightOutflow->h.elemVector(), 1, i_mpiCol,
i_rightNeighborRank, 4, MPI_COMM_WORLD, l_request);
MPI_Irecv ( o_leftInflow->h.elemVector(), 1, i_mpiCol,
i_leftNeighborRank, 4, MPI_COMM_WORLD, l_request);
// MPI_Sendrecv( i_rightOutflow->h.elemVector(), 1, i_mpiCol, i_rightNeighborRank, 4,
// o_leftInflow->h.elemVector(), 1, i_mpiCol, i_leftNeighborRank, 4,
// MPI_COMM_WORLD, &l_status );
MPI_Isend ( i_rightOutflow->hu.elemVector(), 1, i_mpiCol,
i_rightNeighborRank, 5, MPI_COMM_WORLD, l_request);
MPI_Irecv ( o_leftInflow->hu.elemVector(), 1, i_mpiCol,
i_leftNeighborRank, 5, MPI_COMM_WORLD, l_request);
// MPI_Sendrecv( i_rightOutflow->hu.elemVector(), 1, i_mpiCol, i_rightNeighborRank, 5,
// o_leftInflow->hu.elemVector(), 1, i_mpiCol, i_leftNeighborRank, 5,
// MPI_COMM_WORLD, &l_status);
MPI_Isend ( i_rightOutflow->hv.elemVector(), 1, i_mpiCol,
i_rightNeighborRank, 6, MPI_COMM_WORLD, l_request);
MPI_Irecv ( o_leftInflow->hv.elemVector(), 1, i_mpiCol,
i_leftNeighborRank, 6, MPI_COMM_WORLD, l_request);
// MPI_Sendrecv( i_rightOutflow->hv.elemVector(), 1, i_mpiCol, i_rightNeighborRank, 6,
// o_leftInflow->hv.elemVector(), 1, i_mpiCol, i_leftNeighborRank, 6,
// MPI_COMM_WORLD, &l_status );
return l_request;
}
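/**
 * Hedged sketch, not part of this commit: the twelve MPI_Isend/MPI_Irecv calls
 * above all write their handle into the same MPI_Request, so waiting on the
 * returned pointer only completes the last posted transfer. A variant that
 * records one request per call lets the caller finish every transfer with
 * MPI_Waitall after the inner-block computation. Shown for the h component of
 * the left/right exchange only; hu, hv and the bottom/top exchange would follow
 * the same pattern. The function name and signature are illustrative, not
 * existing API.
 */
int exchangeAsyncLeftRightGhostLayersH( const int i_leftNeighborRank, SWE_Block1D* o_leftInflow, SWE_Block1D* i_leftOutflow,
                                        const int i_rightNeighborRank, SWE_Block1D* o_rightInflow, SWE_Block1D* i_rightOutflow,
                                        MPI_Datatype i_mpiCol, MPI_Request* o_requests ) {
  int l_count = 0;

  // send to left, receive from the right
  MPI_Isend( i_leftOutflow->h.elemVector(), 1, i_mpiCol,
             i_leftNeighborRank, 1, MPI_COMM_WORLD, &o_requests[l_count++] );
  MPI_Irecv( o_rightInflow->h.elemVector(), 1, i_mpiCol,
             i_rightNeighborRank, 1, MPI_COMM_WORLD, &o_requests[l_count++] );

  // send to right, receive from the left
  MPI_Isend( i_rightOutflow->h.elemVector(), 1, i_mpiCol,
             i_rightNeighborRank, 4, MPI_COMM_WORLD, &o_requests[l_count++] );
  MPI_Irecv( o_leftInflow->h.elemVector(), 1, i_mpiCol,
             i_leftNeighborRank, 4, MPI_COMM_WORLD, &o_requests[l_count++] );

  // the caller completes all posted transfers after the inner-block computation:
  //   MPI_Waitall( l_count, o_requests, MPI_STATUSES_IGNORE );
  return l_count;
}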
/**
* Exchanges the bottom and top ghost layers with MPI's SendReceive.
*
@@ -624,5 +725,82 @@ void exchangeBottomTopGhostLayers( const int i_bottomNeighborRank, SWE_Block1D*
MPI_Sendrecv( i_topNeighborOutflow->hv.elemVector(), 1, i_mpiRow, i_topNeighborRank, 16,
o_bottomNeighborInflow->hv.elemVector(), 1, i_mpiRow, i_bottomNeighborRank, 16,
MPI_COMM_WORLD, &l_status );
}
/**
* Exchanges the bottom and top ghost layers asynchronously using MPI's
* non-blocking MPI_Isend/MPI_Irecv.
*
* @param i_bottomNeighborRank MPI rank of the bottom neighbor.
* @param o_bottomNeighborInflow ghost layer, where the bottom neighbor writes into.
* @param i_bottomNeighborOutflow ghost layer, where the bottom neighbor reads from.
* @param i_topNeighborRank MPI rank of the top neighbor.
* @param o_topNeighborInflow ghost layer, where the top neighbor writes into.
* @param i_topNeighborOutflow ghost layer, where the top neighbor reads from.
* @param i_mpiRow MPI data type for the horizontal ghost layers.
* @return pointer to the MPI request handle of the posted transfers.
*/
MPI_Request* exchangeAsyncBottomTopGhostLayers( const int i_bottomNeighborRank, SWE_Block1D* o_bottomNeighborInflow, SWE_Block1D* i_bottomNeighborOutflow,
const int i_topNeighborRank, SWE_Block1D* o_topNeighborInflow, SWE_Block1D* i_topNeighborOutflow,
const MPI_Datatype i_mpiRow) {
// MPI_Status l_status;
MPI_Request* l_request;
l_request = (MPI_Request*) malloc(sizeof(MPI_Request));
// send to bottom, receive from the top:
MPI_Isend ( i_bottomNeighborOutflow->h.elemVector(), 1, i_mpiRow,
i_bottomNeighborRank, 11, MPI_COMM_WORLD, l_request);
MPI_Irecv ( o_topNeighborInflow->h.elemVector(), 1, i_mpiRow,
i_topNeighborRank, 11, MPI_COMM_WORLD, l_request);
// MPI_Sendrecv( i_bottomNeighborOutflow->h.elemVector(), 1, i_mpiRow, i_bottomNeighborRank, 11,
// o_topNeighborInflow->h.elemVector(), 1, i_mpiRow, i_topNeighborRank,11,
// MPI_COMM_WORLD, &l_status );
MPI_Isend ( i_bottomNeighborOutflow->hu.elemVector(), 1, i_mpiRow,
i_bottomNeighborRank, 12, MPI_COMM_WORLD, l_request);
MPI_Irecv ( o_topNeighborInflow->hu.elemVector(), 1, i_mpiRow,
i_topNeighborRank, 12, MPI_COMM_WORLD, l_request);
// MPI_Sendrecv( i_bottomNeighborOutflow->hu.elemVector(), 1, i_mpiRow, i_bottomNeighborRank, 12,
// o_topNeighborInflow->hu.elemVector(), 1, i_mpiRow, i_topNeighborRank, 12,
// MPI_COMM_WORLD, &l_status );
MPI_Isend ( i_bottomNeighborOutflow->hv.elemVector(), 1, i_mpiRow,
i_bottomNeighborRank, 13, MPI_COMM_WORLD, l_request);
MPI_Irecv ( o_topNeighborInflow->hv.elemVector(), 1, i_mpiRow,
i_topNeighborRank, 13, MPI_COMM_WORLD, l_request);
// MPI_Sendrecv( i_bottomNeighborOutflow->hv.elemVector(), 1, i_mpiRow, i_bottomNeighborRank, 13,
// o_topNeighborInflow->hv.elemVector(), 1, i_mpiRow, i_topNeighborRank, 13,
// MPI_COMM_WORLD, &l_status);
// send to top, receive from the bottom:
MPI_Isend ( i_topNeighborOutflow->h.elemVector(), 1, i_mpiRow,
i_topNeighborRank, 14, MPI_COMM_WORLD, l_request);
MPI_Irecv ( o_bottomNeighborInflow->h.elemVector(), 1, i_mpiRow,
i_bottomNeighborRank, 14, MPI_COMM_WORLD, l_request);
// MPI_Sendrecv( i_topNeighborOutflow->h.elemVector(), 1, i_mpiRow, i_topNeighborRank, 14,
// o_bottomNeighborInflow->h.elemVector(), 1, i_mpiRow, i_bottomNeighborRank, 14,
// MPI_COMM_WORLD, &l_status );
MPI_Isend ( i_topNeighborOutflow->hu.elemVector(), 1, i_mpiRow,
i_topNeighborRank, 15, MPI_COMM_WORLD, l_request);
MPI_Irecv ( o_bottomNeighborInflow->hu.elemVector(), 1, i_mpiRow,
i_bottomNeighborRank, 15, MPI_COMM_WORLD, l_request);
// MPI_Sendrecv( i_topNeighborOutflow->hu.elemVector(), 1, i_mpiRow, i_topNeighborRank, 15,
// o_bottomNeighborInflow->hu.elemVector(), 1, i_mpiRow, i_bottomNeighborRank, 15,
// MPI_COMM_WORLD, &l_status );
MPI_Isend ( i_topNeighborOutflow->hv.elemVector(), 1, i_mpiRow,
i_topNeighborRank, 16, MPI_COMM_WORLD, l_request);
MPI_Irecv ( o_bottomNeighborInflow->hv.elemVector(), 1, i_mpiRow,
i_bottomNeighborRank, 16, MPI_COMM_WORLD, l_request);
// MPI_Sendrecv( i_topNeighborOutflow->hv.elemVector(), 1, i_mpiRow, i_topNeighborRank, 16,
// o_bottomNeighborInflow->hv.elemVector(), 1, i_mpiRow, i_bottomNeighborRank, 16,
// MPI_COMM_WORLD, &l_status );
return l_request;
}