From 83dfade37a746043dfc8d38f57514706d8505352 Mon Sep 17 00:00:00 2001
From: Daniel Genrich <daniel.genrich@gmx.net>
Date: Mon, 25 Jan 2010 15:10:14 +0000
Subject: =?UTF-8?q?Smoke:=20The=20well=20known=20Miika=20H=C3=A4m=C3=A4l?=
 =?UTF-8?q?=C3=A4inen=20(aka=20MiikaH)=20patch=20(http://blenderartists.or?=
 =?UTF-8?q?g/forum/showthread.php=3Ft=3D158317&page=3D42)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Better (and windows enabled) OpenMP handling (> 2x-5x speed)
* More Volumetric Texture mapping options (heat, etc) <-- Matt if that's not to your liking, just revert that part, it's separate anyway
* Initial velocity taken from particle settings (no more slow starting)
* Option to select compression method (there seem to be a bug in my high compression usage, at least it's been reported to result in exploding smoke - better use low compression for the time being)

It's been tested since a while but as usual please report any (new!) bugs. ;-)
---
 intern/smoke/intern/EIGENVALUE_HELPER.cpp | 885 ++++++++++++++++++++++++++++++
 intern/smoke/intern/EIGENVALUE_HELPER.h   |  69 ++-
 intern/smoke/intern/FLUID_3D.cpp          | 761 +++++++++++++++++++------
 intern/smoke/intern/FLUID_3D.h            |  85 ++-
 intern/smoke/intern/FLUID_3D_SOLVERS.cpp  | 536 ++++++------------
 intern/smoke/intern/FLUID_3D_STATIC.cpp   | 204 ++++---
 intern/smoke/intern/LU_HELPER.cpp         | 136 +++++
 intern/smoke/intern/LU_HELPER.h           |  45 +-
 intern/smoke/intern/WTURBULENCE.cpp       | 166 ++++--
 intern/smoke/intern/smoke_API.cpp         |  12 +-
 10 files changed, 2177 insertions(+), 722 deletions(-)
 create mode 100644 intern/smoke/intern/EIGENVALUE_HELPER.cpp
 create mode 100644 intern/smoke/intern/LU_HELPER.cpp

(limited to 'intern/smoke')

diff --git a/intern/smoke/intern/EIGENVALUE_HELPER.cpp b/intern/smoke/intern/EIGENVALUE_HELPER.cpp
new file mode 100644
index 00000000000..f11b5795e33
--- /dev/null
+++ b/intern/smoke/intern/EIGENVALUE_HELPER.cpp
@@ -0,0 +1,885 @@
+
+#include "EIGENVALUE_HELPER.h"
+
+
+void Eigentred2(sEigenvalue& eval) {
+
+   //  This is derived from the Algol procedures tred2 by
+   //  Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
+   //  Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
+   //  Fortran subroutine in EISPACK.
+
+	  int n=eval.n;
+
+      for (int j = 0; j < n; j++) {
+         eval.d[j] = eval.V[n-1][j];
+      }
+
+      // Householder reduction to tridiagonal form.
+   
+      for (int i = n-1; i > 0; i--) {
+   
+         // Scale to avoid under/overflow.
+   
+         float scale = 0.0;
+         float h = 0.0;
+         for (int k = 0; k < i; k++) {
+            scale = scale + fabs(eval.d[k]);
+         }
+         if (scale == 0.0) {
+            eval.e[i] = eval.d[i-1];
+            for (int j = 0; j < i; j++) {
+               eval.d[j] = eval.V[i-1][j];
+               eval.V[i][j] = 0.0;
+               eval.V[j][i] = 0.0;
+            }
+         } else {
+   
+            // Generate Householder vector.
+   
+            for (int k = 0; k < i; k++) {
+               eval.d[k] /= scale;
+               h += eval.d[k] * eval.d[k];
+            }
+            float f = eval.d[i-1];
+            float g = sqrt(h);
+            if (f > 0) {
+               g = -g;
+            }
+            eval.e[i] = scale * g;
+            h = h - f * g;
+            eval.d[i-1] = f - g;
+            for (int j = 0; j < i; j++) {
+               eval.e[j] = 0.0;
+            }
+   
+            // Apply similarity transformation to remaining columns.
+   
+            for (int j = 0; j < i; j++) {
+               f = eval.d[j];
+               eval.V[j][i] = f;
+               g = eval.e[j] + eval.V[j][j] * f;
+               for (int k = j+1; k <= i-1; k++) {
+                  g += eval.V[k][j] * eval.d[k];
+                  eval.e[k] += eval.V[k][j] * f;
+               }
+               eval.e[j] = g;
+            }
+            f = 0.0;
+            for (int j = 0; j < i; j++) {
+               eval.e[j] /= h;
+               f += eval.e[j] * eval.d[j];
+            }
+            float hh = f / (h + h);
+            for (int j = 0; j < i; j++) {
+               eval.e[j] -= hh * eval.d[j];
+            }
+            for (int j = 0; j < i; j++) {
+               f = eval.d[j];
+               g = eval.e[j];
+               for (int k = j; k <= i-1; k++) {
+                  eval.V[k][j] -= (f * eval.e[k] + g * eval.d[k]);
+               }
+               eval.d[j] = eval.V[i-1][j];
+               eval.V[i][j] = 0.0;
+            }
+         }
+         eval.d[i] = h;
+      }
+   
+      // Accumulate transformations.
+   
+      for (int i = 0; i < n-1; i++) {
+         eval.V[n-1][i] = eval.V[i][i];
+         eval.V[i][i] = 1.0;
+         float h = eval.d[i+1];
+         if (h != 0.0) {
+            for (int k = 0; k <= i; k++) {
+               eval.d[k] = eval.V[k][i+1] / h;
+            }
+            for (int j = 0; j <= i; j++) {
+               float g = 0.0;
+               for (int k = 0; k <= i; k++) {
+                  g += eval.V[k][i+1] * eval.V[k][j];
+               }
+               for (int k = 0; k <= i; k++) {
+                  eval.V[k][j] -= g * eval.d[k];
+               }
+            }
+         }
+         for (int k = 0; k <= i; k++) {
+            eval.V[k][i+1] = 0.0;
+         }
+      }
+      for (int j = 0; j < n; j++) {
+         eval.d[j] = eval.V[n-1][j];
+         eval.V[n-1][j] = 0.0;
+      }
+      eval.V[n-1][n-1] = 1.0;
+      eval.e[0] = 0.0;
+}
+
+void Eigencdiv(sEigenvalue& eval, float xr, float xi, float yr, float yi) {
+      float r,d;
+      if (fabs(yr) > fabs(yi)) {
+         r = yi/yr;
+         d = yr + r*yi;
+         eval.cdivr = (xr + r*xi)/d;
+         eval.cdivi = (xi - r*xr)/d;
+      } else {
+         r = yr/yi;
+         d = yi + r*yr;
+         eval.cdivr = (r*xr + xi)/d;
+         eval.cdivi = (r*xi - xr)/d;
+      }
+   }
+
+void Eigentql2 (sEigenvalue& eval) {
+
+   //  This is derived from the Algol procedures tql2, by
+   //  Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
+   //  Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
+   //  Fortran subroutine in EISPACK.
+   
+	  int n=eval.n;
+
+      for (int i = 1; i < n; i++) {
+         eval.e[i-1] = eval.e[i];
+      }
+      eval.e[n-1] = 0.0;
+   
+      float f = 0.0;
+      float tst1 = 0.0;
+      float eps = pow(2.0,-52.0);
+      for (int l = 0; l < n; l++) {
+
+         // Find small subdiagonal element
+   
+         tst1 = max(tst1,fabs(eval.d[l]) + fabs(eval.e[l]));
+         int m = l;
+
+        // Original while-loop from Java code
+         while (m < n) {
+            if (fabs(eval.e[m]) <= eps*tst1) {
+               break;
+            }
+            m++;
+         }
+
+   
+         // If m == l, d[l] is an eigenvalue,
+         // otherwise, iterate.
+   
+         if (m > l) {
+            int iter = 0;
+            do {
+               iter = iter + 1;  // (Could check iteration count here.)
+   
+               // Compute implicit shift
+
+               float g = eval.d[l];
+               float p = (eval.d[l+1] - g) / (2.0 * eval.e[l]);
+               float r = hypot(p,1.0);
+               if (p < 0) {
+                  r = -r;
+               }
+               eval.d[l] = eval.e[l] / (p + r);
+               eval.d[l+1] = eval.e[l] * (p + r);
+               float dl1 = eval.d[l+1];
+               float h = g - eval.d[l];
+               for (int i = l+2; i < n; i++) {
+                  eval.d[i] -= h;
+               }
+               f = f + h;
+   
+               // Implicit QL transformation.
+   
+               p = eval.d[m];
+               float c = 1.0;
+               float c2 = c;
+               float c3 = c;
+               float el1 = eval.e[l+1];
+               float s = 0.0;
+               float s2 = 0.0;
+               for (int i = m-1; i >= l; i--) {
+                  c3 = c2;
+                  c2 = c;
+                  s2 = s;
+                  g = c * eval.e[i];
+                  h = c * p;
+                  r = hypot(p,eval.e[i]);
+                  eval.e[i+1] = s * r;
+                  s = eval.e[i] / r;
+                  c = p / r;
+                  p = c * eval.d[i] - s * g;
+                  eval.d[i+1] = h + s * (c * g + s * eval.d[i]);
+   
+                  // Accumulate transformation.
+   
+                  for (int k = 0; k < n; k++) {
+                     h = eval.V[k][i+1];
+                     eval.V[k][i+1] = s * eval.V[k][i] + c * h;
+                     eval.V[k][i] = c * eval.V[k][i] - s * h;
+                  }
+               }
+               p = -s * s2 * c3 * el1 * eval.e[l] / dl1;
+               eval.e[l] = s * p;
+               eval.d[l] = c * p;
+   
+               // Check for convergence.
+   
+            } while (fabs(eval.e[l]) > eps*tst1);
+         }
+         eval.d[l] = eval.d[l] + f;
+         eval.e[l] = 0.0;
+      }
+     
+      // Sort eigenvalues and corresponding vectors.
+   
+      for (int i = 0; i < n-1; i++) {
+         int k = i;
+         float p = eval.d[i];
+         for (int j = i+1; j < n; j++) {
+            if (eval.d[j] < p) {
+               k = j;
+               p = eval.d[j];
+            }
+         }
+         if (k != i) {
+            eval.d[k] = eval.d[i];
+            eval.d[i] = p;
+            for (int j = 0; j < n; j++) {
+               p = eval.V[j][i];
+               eval.V[j][i] = eval.V[j][k];
+               eval.V[j][k] = p;
+            }
+         }
+      }
+}
+
+void Eigenorthes (sEigenvalue& eval) {
+   
+      //  This is derived from the Algol procedures orthes and ortran,
+      //  by Martin and Wilkinson, Handbook for Auto. Comp.,
+      //  Vol.ii-Linear Algebra, and the corresponding
+      //  Fortran subroutines in EISPACK.
+   
+	  int n=eval.n;
+
+      int low = 0;
+      int high = n-1;
+   
+      for (int m = low+1; m <= high-1; m++) {
+   
+         // Scale column.
+   
+         float scale = 0.0;
+         for (int i = m; i <= high; i++) {
+            scale = scale + fabs(eval.H[i][m-1]);
+         }
+         if (scale != 0.0) {
+   
+            // Compute Householder transformation.
+   
+            float h = 0.0;
+            for (int i = high; i >= m; i--) {
+               eval.ort[i] = eval.H[i][m-1]/scale;
+               h += eval.ort[i] * eval.ort[i];
+            }
+            float g = sqrt(h);
+            if (eval.ort[m] > 0) {
+               g = -g;
+            }
+            h = h - eval.ort[m] * g;
+            eval.ort[m] = eval.ort[m] - g;
+   
+            // Apply Householder similarity transformation
+            // H = (I-u*u'/h)*H*(I-u*u')/h)
+   
+            for (int j = m; j < n; j++) {
+               float f = 0.0;
+               for (int i = high; i >= m; i--) {
+                  f += eval.ort[i]*eval.H[i][j];
+               }
+               f = f/h;
+               for (int i = m; i <= high; i++) {
+                  eval.H[i][j] -= f*eval.ort[i];
+               }
+           }
+   
+           for (int i = 0; i <= high; i++) {
+               float f = 0.0;
+               for (int j = high; j >= m; j--) {
+                  f += eval.ort[j]*eval.H[i][j];
+               }
+               f = f/h;
+               for (int j = m; j <= high; j++) {
+                  eval.H[i][j] -= f*eval.ort[j];
+               }
+            }
+            eval.ort[m] = scale*eval.ort[m];
+            eval.H[m][m-1] = scale*g;
+         }
+      }
+   
+      // Accumulate transformations (Algol's ortran).
+
+      for (int i = 0; i < n; i++) {
+         for (int j = 0; j < n; j++) {
+            eval.V[i][j] = (i == j ? 1.0 : 0.0);
+         }
+      }
+
+      for (int m = high-1; m >= low+1; m--) {
+         if (eval.H[m][m-1] != 0.0) {
+            for (int i = m+1; i <= high; i++) {
+               eval.ort[i] = eval.H[i][m-1];
+            }
+            for (int j = m; j <= high; j++) {
+               float g = 0.0;
+               for (int i = m; i <= high; i++) {
+                  g += eval.ort[i] * eval.V[i][j];
+               }
+               // Double division avoids possible underflow
+               g = (g / eval.ort[m]) / eval.H[m][m-1];
+               for (int i = m; i <= high; i++) {
+                  eval.V[i][j] += g * eval.ort[i];
+               }
+            }
+         }
+      }
+   }
+
+void Eigenhqr2 (sEigenvalue& eval) {
+   
+      //  This is derived from the Algol procedure hqr2,
+      //  by Martin and Wilkinson, Handbook for Auto. Comp.,
+      //  Vol.ii-Linear Algebra, and the corresponding
+      //  Fortran subroutine in EISPACK.
+   
+      // Initialize
+   
+      int nn = eval.n;
+      int n = nn-1;
+      int low = 0;
+      int high = nn-1;
+      float eps = pow(2.0,-52.0);
+      float exshift = 0.0;
+      float p=0,q=0,r=0,s=0,z=0,t,w,x,y;
+   
+      // Store roots isolated by balanc and compute matrix norm
+   
+      float norm = 0.0;
+      for (int i = 0; i < nn; i++) {
+         if ((i < low) || (i > high)) {
+            eval.d[i] = eval.H[i][i];
+            eval.e[i] = 0.0;
+         }
+         for (int j = max(i-1,0); j < nn; j++) {
+            norm = norm + fabs(eval.H[i][j]);
+         }
+      }
+   
+      // Outer loop over eigenvalue index
+   
+      int iter = 0;
+		int totIter = 0;
+      while (n >= low) {
+
+			// NT limit no. of iterations
+			totIter++;
+			if(totIter>100) {
+				//if(totIter>15) std::cout<<"!!!!iter ABORT !!!!!!! "<<totIter<<"\n"; 
+				// NT hack/fix, return large eigenvalues
+				for (int i = 0; i < nn; i++) {
+					eval.d[i] = 10000.;
+					eval.e[i] = 10000.;
+				}
+				return;
+			}
+   
+         // Look for single small sub-diagonal element
+   
+         int l = n;
+         while (l > low) {
+            s = fabs(eval.H[l-1][l-1]) + fabs(eval.H[l][l]);
+            if (s == 0.0) {
+               s = norm;
+            }
+            if (fabs(eval.H[l][l-1]) < eps * s) {
+               break;
+            }
+            l--;
+         }
+       
+         // Check for convergence
+         // One root found
+   
+         if (l == n) {
+            eval.H[n][n] = eval.H[n][n] + exshift;
+            eval.d[n] = eval.H[n][n];
+            eval.e[n] = 0.0;
+            n--;
+            iter = 0;
+   
+         // Two roots found
+   
+         } else if (l == n-1) {
+            w = eval.H[n][n-1] * eval.H[n-1][n];
+            p = (eval.H[n-1][n-1] - eval.H[n][n]) / 2.0;
+            q = p * p + w;
+            z = sqrt(fabs(q));
+            eval.H[n][n] = eval.H[n][n] + exshift;
+            eval.H[n-1][n-1] = eval.H[n-1][n-1] + exshift;
+            x = eval.H[n][n];
+   
+            // float pair
+   
+            if (q >= 0) {
+               if (p >= 0) {
+                  z = p + z;
+               } else {
+                  z = p - z;
+               }
+               eval.d[n-1] = x + z;
+               eval.d[n] = eval.d[n-1];
+               if (z != 0.0) {
+                  eval.d[n] = x - w / z;
+               }
+               eval.e[n-1] = 0.0;
+               eval.e[n] = 0.0;
+               x = eval.H[n][n-1];
+               s = fabs(x) + fabs(z);
+               p = x / s;
+               q = z / s;
+               r = sqrt(p * p+q * q);
+               p = p / r;
+               q = q / r;
+   
+               // Row modification
+   
+               for (int j = n-1; j < nn; j++) {
+                  z = eval.H[n-1][j];
+                  eval.H[n-1][j] = q * z + p * eval.H[n][j];
+                  eval.H[n][j] = q * eval.H[n][j] - p * z;
+               }
+   
+               // Column modification
+   
+               for (int i = 0; i <= n; i++) {
+                  z = eval.H[i][n-1];
+                  eval.H[i][n-1] = q * z + p * eval.H[i][n];
+                  eval.H[i][n] = q * eval.H[i][n] - p * z;
+               }
+   
+               // Accumulate transformations
+   
+               for (int i = low; i <= high; i++) {
+                  z = eval.V[i][n-1];
+                  eval.V[i][n-1] = q * z + p * eval.V[i][n];
+                  eval.V[i][n] = q * eval.V[i][n] - p * z;
+               }
+   
+            // Complex pair
+   
+            } else {
+               eval.d[n-1] = x + p;
+               eval.d[n] = x + p;
+               eval.e[n-1] = z;
+               eval.e[n] = -z;
+            }
+            n = n - 2;
+            iter = 0;
+   
+         // No convergence yet
+   
+         } else {
+   
+            // Form shift
+   
+            x = eval.H[n][n];
+            y = 0.0;
+            w = 0.0;
+            if (l < n) {
+               y = eval.H[n-1][n-1];
+               w = eval.H[n][n-1] * eval.H[n-1][n];
+            }
+   
+            // Wilkinson's original ad hoc shift
+   
+            if (iter == 10) {
+               exshift += x;
+               for (int i = low; i <= n; i++) {
+                  eval.H[i][i] -= x;
+               }
+               s = fabs(eval.H[n][n-1]) + fabs(eval.H[n-1][n-2]);
+               x = y = 0.75 * s;
+               w = -0.4375 * s * s;
+            }
+
+            // MATLAB's new ad hoc shift
+
+            if (iter == 30) {
+                s = (y - x) / 2.0;
+                s = s * s + w;
+                if (s > 0) {
+                    s = sqrt(s);
+                    if (y < x) {
+                       s = -s;
+                    }
+                    s = x - w / ((y - x) / 2.0 + s);
+                    for (int i = low; i <= n; i++) {
+                       eval.H[i][i] -= s;
+                    }
+                    exshift += s;
+                    x = y = w = 0.964;
+                }
+            }
+   
+            iter = iter + 1;   // (Could check iteration count here.)
+   
+            // Look for two consecutive small sub-diagonal elements
+   
+            int m = n-2;
+            while (m >= l) {
+               z = eval.H[m][m];
+               r = x - z;
+               s = y - z;
+               p = (r * s - w) / eval.H[m+1][m] + eval.H[m][m+1];
+               q = eval.H[m+1][m+1] - z - r - s;
+               r = eval.H[m+2][m+1];
+               s = fabs(p) + fabs(q) + fabs(r);
+               p = p / s;
+               q = q / s;
+               r = r / s;
+               if (m == l) {
+                  break;
+               }
+               if (fabs(eval.H[m][m-1]) * (fabs(q) + fabs(r)) <
+                  eps * (fabs(p) * (fabs(eval.H[m-1][m-1]) + fabs(z) +
+                  fabs(eval.H[m+1][m+1])))) {
+                     break;
+               }
+               m--;
+            }
+   
+            for (int i = m+2; i <= n; i++) {
+               eval.H[i][i-2] = 0.0;
+               if (i > m+2) {
+                  eval.H[i][i-3] = 0.0;
+               }
+            }
+   
+            // Double QR step involving rows l:n and columns m:n
+   
+            for (int k = m; k <= n-1; k++) {
+               int notlast = (k != n-1);
+               if (k != m) {
+                  p = eval.H[k][k-1];
+                  q = eval.H[k+1][k-1];
+                  r = (notlast ? eval.H[k+2][k-1] : 0.0);
+                  x = fabs(p) + fabs(q) + fabs(r);
+                  if (x != 0.0) {
+                     p = p / x;
+                     q = q / x;
+                     r = r / x;
+                  }
+               }
+               if (x == 0.0) {
+                  break;
+               }
+               s = sqrt(p * p + q * q + r * r);
+               if (p < 0) {
+                  s = -s;
+               }
+               if (s != 0) {
+                  if (k != m) {
+                     eval.H[k][k-1] = -s * x;
+                  } else if (l != m) {
+                     eval.H[k][k-1] = -eval.H[k][k-1];
+                  }
+                  p = p + s;
+                  x = p / s;
+                  y = q / s;
+                  z = r / s;
+                  q = q / p;
+                  r = r / p;
+   
+                  // Row modification
+   
+                  for (int j = k; j < nn; j++) {
+                     p = eval.H[k][j] + q * eval.H[k+1][j];
+                     if (notlast) {
+                        p = p + r * eval.H[k+2][j];
+                        eval.H[k+2][j] = eval.H[k+2][j] - p * z;
+                     }
+                     eval.H[k][j] = eval.H[k][j] - p * x;
+                     eval.H[k+1][j] = eval.H[k+1][j] - p * y;
+                  }
+   
+                  // Column modification
+   
+                  for (int i = 0; i <= min(n,k+3); i++) {
+                     p = x * eval.H[i][k] + y * eval.H[i][k+1];
+                     if (notlast) {
+                        p = p + z * eval.H[i][k+2];
+                        eval.H[i][k+2] = eval.H[i][k+2] - p * r;
+                     }
+                     eval.H[i][k] = eval.H[i][k] - p;
+                     eval.H[i][k+1] = eval.H[i][k+1] - p * q;
+                  }
+   
+                  // Accumulate transformations
+   
+                  for (int i = low; i <= high; i++) {
+                     p = x * eval.V[i][k] + y * eval.V[i][k+1];
+                     if (notlast) {
+                        p = p + z * eval.V[i][k+2];
+                        eval.V[i][k+2] = eval.V[i][k+2] - p * r;
+                     }
+                     eval.V[i][k] = eval.V[i][k] - p;
+                     eval.V[i][k+1] = eval.V[i][k+1] - p * q;
+                  }
+               }  // (s != 0)
+            }  // k loop
+         }  // check convergence
+      }  // while (n >= low)
+		//if(totIter>15) std::cout<<"!!!!iter "<<totIter<<"\n";
+      
+      // Backsubstitute to find vectors of upper triangular form
+
+      if (norm == 0.0) {
+         return;
+      }
+   
+      for (n = nn-1; n >= 0; n--) {
+         p = eval.d[n];
+         q = eval.e[n];
+   
+         // float vector
+   
+         if (q == 0) {
+            int l = n;
+            eval.H[n][n] = 1.0;
+            for (int i = n-1; i >= 0; i--) {
+               w = eval.H[i][i] - p;
+               r = 0.0;
+               for (int j = l; j <= n; j++) {
+                  r = r + eval.H[i][j] * eval.H[j][n];
+               }
+               if (eval.e[i] < 0.0) {
+                  z = w;
+                  s = r;
+               } else {
+                  l = i;
+                  if (eval.e[i] == 0.0) {
+                     if (w != 0.0) {
+                        eval.H[i][n] = -r / w;
+                     } else {
+                        eval.H[i][n] = -r / (eps * norm);
+                     }
+   
+                  // Solve real equations
+   
+                  } else {
+                     x = eval.H[i][i+1];
+                     y = eval.H[i+1][i];
+                     q = (eval.d[i] - p) * (eval.d[i] - p) + eval.e[i] * eval.e[i];
+                     t = (x * s - z * r) / q;
+                     eval.H[i][n] = t;
+                     if (fabs(x) > fabs(z)) {
+                        eval.H[i+1][n] = (-r - w * t) / x;
+                     } else {
+                        eval.H[i+1][n] = (-s - y * t) / z;
+                     }
+                  }
+   
+                  // Overflow control
+   
+                  t = fabs(eval.H[i][n]);
+                  if ((eps * t) * t > 1) {
+                     for (int j = i; j <= n; j++) {
+                        eval.H[j][n] = eval.H[j][n] / t;
+                     }
+                  }
+               }
+            }
+   
+         // Complex vector
+   
+         } else if (q < 0) {
+            int l = n-1;
+
+            // Last vector component imaginary so matrix is triangular
+   
+            if (fabs(eval.H[n][n-1]) > fabs(eval.H[n-1][n])) {
+               eval.H[n-1][n-1] = q / eval.H[n][n-1];
+               eval.H[n-1][n] = -(eval.H[n][n] - p) / eval.H[n][n-1];
+            } else {
+               Eigencdiv(eval, 0.0,-eval.H[n-1][n],eval.H[n-1][n-1]-p,q);
+               eval.H[n-1][n-1] = eval.cdivr;
+               eval.H[n-1][n] = eval.cdivi;
+            }
+            eval.H[n][n-1] = 0.0;
+            eval.H[n][n] = 1.0;
+            for (int i = n-2; i >= 0; i--) {
+               float ra,sa,vr,vi;
+               ra = 0.0;
+               sa = 0.0;
+               for (int j = l; j <= n; j++) {
+                  ra = ra + eval.H[i][j] * eval.H[j][n-1];
+                  sa = sa + eval.H[i][j] * eval.H[j][n];
+               }
+               w = eval.H[i][i] - p;
+   
+               if (eval.e[i] < 0.0) {
+                  z = w;
+                  r = ra;
+                  s = sa;
+               } else {
+                  l = i;
+                  if (eval.e[i] == 0) {
+                     Eigencdiv(eval,-ra,-sa,w,q);
+                     eval.H[i][n-1] = eval.cdivr;
+                     eval.H[i][n] = eval.cdivi;
+                  } else {
+   
+                     // Solve complex equations
+   
+                     x = eval.H[i][i+1];
+                     y = eval.H[i+1][i];
+                     vr = (eval.d[i] - p) * (eval.d[i] - p) + eval.e[i] * eval.e[i] - q * q;
+                     vi = (eval.d[i] - p) * 2.0 * q;
+                     if ((vr == 0.0) && (vi == 0.0)) {
+                        vr = eps * norm * (fabs(w) + fabs(q) +
+                        fabs(x) + fabs(y) + fabs(z));
+                     }
+                     Eigencdiv(eval, x*r-z*ra+q*sa,x*s-z*sa-q*ra,vr,vi);
+                     eval.H[i][n-1] = eval.cdivr;
+                     eval.H[i][n] = eval.cdivi;
+                     if (fabs(x) > (fabs(z) + fabs(q))) {
+                        eval.H[i+1][n-1] = (-ra - w * eval.H[i][n-1] + q * eval.H[i][n]) / x;
+                        eval.H[i+1][n] = (-sa - w * eval.H[i][n] - q * eval.H[i][n-1]) / x;
+                     } else {
+                        Eigencdiv(eval, -r-y*eval.H[i][n-1],-s-y*eval.H[i][n],z,q);
+                        eval.H[i+1][n-1] = eval.cdivr;
+                        eval.H[i+1][n] = eval.cdivi;
+                     }
+                  }
+   
+                  // Overflow control
+
+                  t = max(fabs(eval.H[i][n-1]),fabs(eval.H[i][n]));
+                  if ((eps * t) * t > 1) {
+                     for (int j = i; j <= n; j++) {
+                        eval.H[j][n-1] = eval.H[j][n-1] / t;
+                        eval.H[j][n] = eval.H[j][n] / t;
+                     }
+                  }
+               }
+            }
+         }
+      }
+   
+      // Vectors of isolated roots
+   
+      for (int i = 0; i < nn; i++) {
+         if (i < low || i > high) {
+            for (int j = i; j < nn; j++) {
+               eval.V[i][j] = eval.H[i][j];
+            }
+         }
+      }
+   
+      // Back transformation to get eigenvectors of original matrix
+   
+      for (int j = nn-1; j >= low; j--) {
+         for (int i = low; i <= high; i++) {
+            z = 0.0;
+            for (int k = low; k <= min(j,high); k++) {
+               z = z + eval.V[i][k] * eval.H[k][j];
+            }
+            eval.V[i][j] = z;
+         }
+      }
+}
+
+
+
+int computeEigenvalues3x3(
+		float dout[3], 
+		float a[3][3])
+{
+  /*TNT::Array2D<float> A = TNT::Array2D<float>(3,3, &a[0][0]);
+  TNT::Array1D<float> eig = TNT::Array1D<float>(3);
+  TNT::Array1D<float> eigImag = TNT::Array1D<float>(3);
+  JAMA::Eigenvalue<float> jeig = JAMA::Eigenvalue<float>(A);*/
+
+	sEigenvalue jeig;
+
+	// Compute the values
+	{
+		jeig.n = 3;
+		int n=3;
+      //V = Array2D<float>(n,n);
+      //d = Array1D<float>(n);
+      //e = Array1D<float>(n);
+		for (int y=0; y<3; y++)
+		 {
+			 jeig.d[y]=0.0f;
+			 jeig.e[y]=0.0f;
+			 for (int t=0; t<3; t++) jeig.V[y][t]=0.0f;
+		 }
+
+      jeig.issymmetric = 1;
+      for (int j = 0; (j < 3) && jeig.issymmetric; j++) {
+         for (int i = 0; (i < 3) && jeig.issymmetric; i++) {
+            jeig.issymmetric = (a[i][j] == a[j][i]);
+         }
+      }
+
+      if (jeig.issymmetric) {
+         for (int i = 0; i < 3; i++) {
+            for (int j = 0; j < 3; j++) {
+               jeig.V[i][j] = a[i][j];
+            }
+         }
+   
+         // Tridiagonalize.
+         Eigentred2(jeig);
+   
+         // Diagonalize.
+         Eigentql2(jeig);
+
+      } else {
+         //H = TNT::Array2D<float>(n,n);
+	     for (int y=0; y<3; y++)
+		 {
+			 jeig.ort[y]=0.0f;
+			 for (int t=0; t<3; t++) jeig.H[y][t]=0.0f;
+		 }
+         //ort = TNT::Array1D<float>(n);
+         
+         for (int j = 0; j < n; j++) {
+            for (int i = 0; i < n; i++) {
+               jeig.H[i][j] = a[i][j];
+            }
+         }
+   
+         // Reduce to Hessenberg form.
+         Eigenorthes(jeig);
+   
+         // Reduce Hessenberg to real Schur form.
+         Eigenhqr2(jeig);
+      }
+   }
+
+  //jeig.getfloatEigenvalues(eig);
+
+  // complex ones
+  //jeig.getImagEigenvalues(eigImag);
+  dout[0]  = sqrt(jeig.d[0]*jeig.d[0] + jeig.e[0]*jeig.e[0]);
+  dout[1]  = sqrt(jeig.d[1]*jeig.d[1] + jeig.e[1]*jeig.e[1]);
+  dout[2]  = sqrt(jeig.d[2]*jeig.d[2] + jeig.e[2]*jeig.e[2]);
+  return 0;
+}
diff --git a/intern/smoke/intern/EIGENVALUE_HELPER.h b/intern/smoke/intern/EIGENVALUE_HELPER.h
index 6ff61c5ca8e..9c169711c09 100644
--- a/intern/smoke/intern/EIGENVALUE_HELPER.h
+++ b/intern/smoke/intern/EIGENVALUE_HELPER.h
@@ -15,33 +15,60 @@
 // along with Wavelet Turbulence.  If not, see <http://www.gnu.org/licenses/>.
 // 
 // Copyright 2008 Theodore Kim and Nils Thuerey
-// 
+//
+//////////////////////////////////////////////////////////////////////
+// Modified to not require TNT matrix library anymore. It was very slow
+// when being run in parallel. Required TNT JAMA::Eigenvalue libraries were
+// converted into independent functions.
+//		- MiikaH
+//
 //////////////////////////////////////////////////////////////////////
 // Helper function, compute eigenvalues of 3x3 matrix
 //////////////////////////////////////////////////////////////////////
 
-#include "tnt/jama_eig.h"
+#ifndef EIGENVAL_HELPER_H
+#define EIGENVAL_HELPER_H
+
+//#include "tnt/jama_eig.h"
+
+#include <algorithm>
+#include <cmath>
+
+using namespace std;
 
 //////////////////////////////////////////////////////////////////////
 // eigenvalues of 3x3 non-symmetric matrix
 //////////////////////////////////////////////////////////////////////
-int inline computeEigenvalues3x3(
-		float dout[3], 
-		float a[3][3])
+
+
+struct sEigenvalue
 {
-  TNT::Array2D<float> A = TNT::Array2D<float>(3,3, &a[0][0]);
-  TNT::Array1D<float> eig = TNT::Array1D<float>(3);
-  TNT::Array1D<float> eigImag = TNT::Array1D<float>(3);
-  JAMA::Eigenvalue<float> jeig = JAMA::Eigenvalue<float>(A);
-  jeig.getRealEigenvalues(eig);
-
-  // complex ones
-  jeig.getImagEigenvalues(eigImag);
-  dout[0]  = sqrt(eig[0]*eig[0] + eigImag[0]*eigImag[0]);
-  dout[1]  = sqrt(eig[1]*eig[1] + eigImag[1]*eigImag[1]);
-  dout[2]  = sqrt(eig[2]*eig[2] + eigImag[2]*eigImag[2]);
-  return 0;
-}
-
-#undef rfabs 
-#undef ROT
+	int n;
+	int issymmetric;
+	float d[3];         /* real part */
+	float e[3];         /* img part */
+	float V[3][3];		/* Eigenvectors */
+
+	float H[3][3];
+   
+
+    float ort[3];
+
+	float cdivr;
+	float cdivi;
+};
+
+void Eigentred2(sEigenvalue& eval);
+
+void Eigencdiv(sEigenvalue& eval, float xr, float xi, float yr, float yi);
+
+void Eigentql2 (sEigenvalue& eval);
+
+void Eigenorthes (sEigenvalue& eval);
+
+void Eigenhqr2 (sEigenvalue& eval);
+
+int computeEigenvalues3x3(float dout[3], float a[3][3]);
+
+
+#endif
diff --git a/intern/smoke/intern/FLUID_3D.cpp b/intern/smoke/intern/FLUID_3D.cpp
index 729d73bb4f3..25673630fc4 100644
--- a/intern/smoke/intern/FLUID_3D.cpp
+++ b/intern/smoke/intern/FLUID_3D.cpp
@@ -19,6 +19,11 @@
 // FLUID_3D.cpp: implementation of the FLUID_3D class.
 //
 //////////////////////////////////////////////////////////////////////
+// Heavy parallel optimization done. Many of the old functions now
+// take begin and end parameters and process only specified part of the data.
+// Some functions were divided into multiple ones.
+//		- MiikaH
+//////////////////////////////////////////////////////////////////////
 
 #include "FLUID_3D.h"
 #include "IMAGE.h"
@@ -26,6 +31,10 @@
 #include "SPHERE.h"
 #include <zlib.h>
 
+#if PARALLEL==1
+#include <omp.h>
+#endif // PARALLEL 
+
 // boundary conditions of the fluid domain
 #define DOMAIN_BC_FRONT  0 // z
 #define DOMAIN_BC_TOP    1 // y
@@ -90,6 +99,13 @@ FLUID_3D::FLUID_3D(int *res, float *p0, float dt) :
 	_heatOld      = new float[_totalCells];
 	_obstacles    = new unsigned char[_totalCells]; // set 0 at end of step
 
+	// For threaded version:
+	_xVelocityTemp = new float[_totalCells];
+	_yVelocityTemp = new float[_totalCells];
+	_zVelocityTemp = new float[_totalCells];
+	_densityTemp   = new float[_totalCells];
+	_heatTemp      = new float[_totalCells];
+
 	// DG TODO: check if alloc went fine
 
 	for (int x = 0; x < _totalCells; x++)
@@ -167,6 +183,12 @@ FLUID_3D::~FLUID_3D()
 	if (_obstacles) delete[] _obstacles;
     // if (_wTurbulence) delete _wTurbulence;
 
+	if (_xVelocityTemp) delete[] _xVelocityTemp;
+	if (_yVelocityTemp) delete[] _yVelocityTemp;
+	if (_zVelocityTemp) delete[] _zVelocityTemp;
+	if (_densityTemp) delete[] _densityTemp;
+	if (_heatTemp) delete[] _heatTemp;
+
     // printf("deleted fluid\n");
 }
 
@@ -182,108 +204,306 @@ void FLUID_3D::initBlenderRNA(float *alpha, float *beta)
 //////////////////////////////////////////////////////////////////////
 void FLUID_3D::step()
 {
-	// addSmokeTestCase(_density, _res);
-	// addSmokeTestCase(_heat, _res);
-
-	wipeBoundaries();
-
-	// run the solvers
-	addVorticity();
-	addBuoyancy(_heat, _density);
-	addForce();
-	project();
-	diffuseHeat();
-
-	// advect everything
-	advectMacCormack();
-
-	// if(_wTurbulence) {
-	// 	_wTurbulence->stepTurbulenceFull(_dt/_dx,
-	//			_xVelocity, _yVelocity, _zVelocity, _obstacles);
-		// _wTurbulence->stepTurbulenceReadable(_dt/_dx,
-		//  _xVelocity, _yVelocity, _zVelocity, _obstacles);
-	// }
-/*
- // no file output
-  float *src = _density;
-	string prefix = string("./original.preview/density_fullxy_");
-	writeImageSliceXY(src,_res, _res[2]/2, prefix, _totalSteps);
-*/
-	// artificial damping -- this is necessary because we use a
-  // collated grid, and at very coarse grid resolutions, banding
-  // artifacts can occur
-	artificialDamping(_xVelocity);
-	artificialDamping(_yVelocity);
-	artificialDamping(_zVelocity);
-/*
-// no file output
-  string pbrtPrefix = string("./pbrt/density_small_");
-  IMAGE::dumpPBRT(_totalSteps, pbrtPrefix, _density, _res[0],_res[1],_res[2]);
-  */
-	_totalTime += _dt;
-	_totalSteps++;	
 
-	// todo xxx dg: only clear obstacles, not boundaries
-	// memset(_obstacles, 0, sizeof(unsigned char)*_xRes*_yRes*_zRes);
+	int threadval = 1;
+#if PARALLEL==1
+	threadval = omp_get_max_threads();
+#endif
+
+	int stepParts = 1;
+	float partSize = _zRes;
+
+#if PARALLEL==1
+	stepParts = threadval*2;	// Dividing parallelized sections into numOfThreads * 2 sections
+	partSize = (float)_zRes/stepParts;	// Size of one part;
+
+  if (partSize < 4) {stepParts = threadval;					// If the slice gets too low (might actually slow things down, change it to larger
+					partSize = (float)_zRes/stepParts;}
+  if (partSize < 4) {stepParts = (int)(ceil((float)_zRes/4.0f));	// If it's still too low (only possible on future systems with +24 cores), change it to 4
+					partSize = (float)_zRes/stepParts;}
+#else
+	int zBegin=0;
+	int zEnd=_zRes;
+#endif
+
+
+#if PARALLEL==1
+	#pragma omp parallel
+	{
+	#pragma omp for schedule(static,1)
+	for (int i=0; i<stepParts; i++)
+	{
+		int zBegin = (int)((float)i*partSize + 0.5f);
+		int zEnd = (int)((float)(i+1)*partSize + 0.5f);
+#endif
+
+		wipeBoundariesSL(zBegin, zEnd);
+		addVorticity(zBegin, zEnd);
+		addBuoyancy(_heat, _density, zBegin, zEnd);
+		addForce(zBegin, zEnd);
+
+#if PARALLEL==1
+	}	// end of parallel
+	#pragma omp barrier
+
+	#pragma omp single
+	{
+#endif
+	SWAP_POINTERS(_xVelocity, _xVelocityTemp);
+	SWAP_POINTERS(_yVelocity, _yVelocityTemp);
+	SWAP_POINTERS(_zVelocity, _zVelocityTemp);
+#if PARALLEL==1
+	}	// end of single
+
+	#pragma omp barrier
+
+	#pragma omp for
+	for (int i=0; i<2; i++)
+	{
+		if (i==0)
+		{
+#endif
+		project();
+#if PARALLEL==1
+		}
+		else
+		{
+#endif
+		diffuseHeat();
+#if PARALLEL==1
+		}
+	}
+
+	#pragma omp barrier
+
+	#pragma omp single
+	{
+#endif
+	SWAP_POINTERS(_xVelocity, _xVelocityOld);
+	SWAP_POINTERS(_yVelocity, _yVelocityOld);
+	SWAP_POINTERS(_zVelocity, _zVelocityOld);
+	SWAP_POINTERS(_density, _densityOld);
+	SWAP_POINTERS(_heat, _heatOld);
+
+	advectMacCormackBegin(0, _zRes);
+
+#if PARALLEL==1
+	}	// end of single
+
+	#pragma omp barrier
+
+
+	#pragma omp for schedule(static,1)
+	for (int i=0; i<stepParts; i++)
+	{
+
+		int zBegin = (int)((float)i*partSize + 0.5f);
+		int zEnd = (int)((float)(i+1)*partSize + 0.5f);
+#endif
+
+	advectMacCormackEnd1(zBegin, zEnd);
+
+#if PARALLEL==1
+	}	// end of parallel
+
+	#pragma omp barrier
+
+	#pragma omp for schedule(static,1)
+	for (int i=0; i<stepParts; i++)
+	{
+
+		int zBegin = (int)((float)i*partSize + 0.5f);
+		int zEnd = (int)((float)(i+1)*partSize + 0.5f);
+#endif
+
+		advectMacCormackEnd2(zBegin, zEnd);
+
+		artificialDampingSL(zBegin, zEnd);
+
+		// Using forces as temp arrays
+
+#if PARALLEL==1
+	}
+	}
+
+
+
+	for (int i=1; i<stepParts; i++)
+	{
+		int zPos=(int)((float)i*partSize + 0.5f);
+		
+		artificialDampingExactSL(zPos);
+
+	}
+#endif
+
+	SWAP_POINTERS(_xVelocity, _xForce);
+	SWAP_POINTERS(_yVelocity, _yForce);
+	SWAP_POINTERS(_zVelocity, _zForce);
+
+
+
+
+	_totalTime += _dt;
+	_totalSteps++;		
 
-	// wipe forces
-	// for external forces we can't do it at the beginning of this function but at the end
 	for (int i = 0; i < _totalCells; i++)
 	{
 		_xForce[i] = _yForce[i] = _zForce[i] = 0.0f;
 	}
+
 }
 
 //////////////////////////////////////////////////////////////////////
 // helper function to dampen co-located grid artifacts of given arrays in intervals
 // (only needed for velocity, strength (w) depends on testcase...
 //////////////////////////////////////////////////////////////////////
-void FLUID_3D::artificialDamping(float* field) {
+
+
+void FLUID_3D::artificialDampingSL(int zBegin, int zEnd) {
 	const float w = 0.9;
+
+	memmove(_xForce+(_slabSize*zBegin), _xVelocityTemp+(_slabSize*zBegin), sizeof(float)*_slabSize*(zEnd-zBegin));
+	memmove(_yForce+(_slabSize*zBegin), _yVelocityTemp+(_slabSize*zBegin), sizeof(float)*_slabSize*(zEnd-zBegin));
+	memmove(_zForce+(_slabSize*zBegin), _zVelocityTemp+(_slabSize*zBegin), sizeof(float)*_slabSize*(zEnd-zBegin));
+
+
 	if(_totalSteps % 4 == 1) {
-		for (int z = 1; z < _res[2]-1; z++)
+		for (int z = zBegin+1; z < zEnd-1; z++)
 			for (int y = 1; y < _res[1]-1; y++)
 				for (int x = 1+(y+z)%2; x < _res[0]-1; x+=2) {
 					const int index = x + y*_res[0] + z * _slabSize;
-					field[index] = (1-w)*field[index] + 1./6. * w*(
-							field[index+1] + field[index-1] +
-							field[index+_res[0]] + field[index-_res[0]] +
-							field[index+_slabSize] + field[index-_slabSize] );
+					_xForce[index] = (1-w)*_xVelocityTemp[index] + 1./6. * w*(
+							_xVelocityTemp[index+1] + _xVelocityTemp[index-1] +
+							_xVelocityTemp[index+_res[0]] + _xVelocityTemp[index-_res[0]] +
+							_xVelocityTemp[index+_slabSize] + _xVelocityTemp[index-_slabSize] );
+
+					_yForce[index] = (1-w)*_yVelocityTemp[index] + 1./6. * w*(
+							_yVelocityTemp[index+1] + _yVelocityTemp[index-1] +
+							_yVelocityTemp[index+_res[0]] + _yVelocityTemp[index-_res[0]] +
+							_yVelocityTemp[index+_slabSize] + _yVelocityTemp[index-_slabSize] );
+
+					_zForce[index] = (1-w)*_zVelocityTemp[index] + 1./6. * w*(
+							_zVelocityTemp[index+1] + _zVelocityTemp[index-1] +
+							_zVelocityTemp[index+_res[0]] + _zVelocityTemp[index-_res[0]] +
+							_zVelocityTemp[index+_slabSize] + _zVelocityTemp[index-_slabSize] );
 				}
 	}
+
 	if(_totalSteps % 4 == 3) {
-		for (int z = 1; z < _res[2]-1; z++)
+		for (int z = zBegin+1; z < zEnd-1; z++)
 			for (int y = 1; y < _res[1]-1; y++)
 				for (int x = 1+(y+z+1)%2; x < _res[0]-1; x+=2) {
 					const int index = x + y*_res[0] + z * _slabSize;
-					field[index] = (1-w)*field[index] + 1./6. * w*(
-							field[index+1] + field[index-1] +
-							field[index+_res[0]] + field[index-_res[0]] +
-							field[index+_slabSize] + field[index-_slabSize] );
+					_xForce[index] = (1-w)*_xVelocityTemp[index] + 1./6. * w*(
+							_xVelocityTemp[index+1] + _xVelocityTemp[index-1] +
+							_xVelocityTemp[index+_res[0]] + _xVelocityTemp[index-_res[0]] +
+							_xVelocityTemp[index+_slabSize] + _xVelocityTemp[index-_slabSize] );
+
+					_yForce[index] = (1-w)*_yVelocityTemp[index] + 1./6. * w*(
+							_yVelocityTemp[index+1] + _yVelocityTemp[index-1] +
+							_yVelocityTemp[index+_res[0]] + _yVelocityTemp[index-_res[0]] +
+							_yVelocityTemp[index+_slabSize] + _yVelocityTemp[index-_slabSize] );
+
+					_zForce[index] = (1-w)*_zVelocityTemp[index] + 1./6. * w*(
+							_zVelocityTemp[index+1] + _zVelocityTemp[index-1] +
+							_zVelocityTemp[index+_res[0]] + _zVelocityTemp[index-_res[0]] +
+							_zVelocityTemp[index+_slabSize] + _zVelocityTemp[index-_slabSize] );
 				}
+
+	}
+}
+
+
+
+void FLUID_3D::artificialDampingExactSL(int pos) {
+	const float w = 0.9;
+	int index, x,y,z;
+	
+
+	size_t posslab;
+
+	for (z=pos-1; z<=pos; z++)
+	{
+	posslab=z * _slabSize;
+
+	if(_totalSteps % 4 == 1) {
+			for (y = 1; y < _res[1]-1; y++)
+				for (x = 1+(y+z)%2; x < _res[0]-1; x+=2) {
+					index = x + y*_res[0] + posslab;
+					_xForce[index] = (1-w)*_xVelocityTemp[index] + 1./6. * w*(
+							_xVelocityTemp[index+1] + _xVelocityTemp[index-1] +
+							_xVelocityTemp[index+_res[0]] + _xVelocityTemp[index-_res[0]] +
+							_xVelocityTemp[index+_slabSize] + _xVelocityTemp[index-_slabSize] );
+
+					_yForce[index] = (1-w)*_yVelocityTemp[index] + 1./6. * w*(
+							_yVelocityTemp[index+1] + _yVelocityTemp[index-1] +
+							_yVelocityTemp[index+_res[0]] + _yVelocityTemp[index-_res[0]] +
+							_yVelocityTemp[index+_slabSize] + _yVelocityTemp[index-_slabSize] );
+
+					_zForce[index] = (1-w)*_zVelocityTemp[index] + 1./6. * w*(
+							_zVelocityTemp[index+1] + _zVelocityTemp[index-1] +
+							_zVelocityTemp[index+_res[0]] + _zVelocityTemp[index-_res[0]] +
+							_zVelocityTemp[index+_slabSize] + _zVelocityTemp[index-_slabSize] );
+					
+				}
+	}
+
+	if(_totalSteps % 4 == 3) {
+			for (y = 1; y < _res[1]-1; y++)
+				for (x = 1+(y+z+1)%2; x < _res[0]-1; x+=2) {
+					index = x + y*_res[0] + posslab;
+					_xForce[index] = (1-w)*_xVelocityTemp[index] + 1./6. * w*(
+							_xVelocityTemp[index+1] + _xVelocityTemp[index-1] +
+							_xVelocityTemp[index+_res[0]] + _xVelocityTemp[index-_res[0]] +
+							_xVelocityTemp[index+_slabSize] + _xVelocityTemp[index-_slabSize] );
+
+					_yForce[index] = (1-w)*_yVelocityTemp[index] + 1./6. * w*(
+							_yVelocityTemp[index+1] + _yVelocityTemp[index-1] +
+							_yVelocityTemp[index+_res[0]] + _yVelocityTemp[index-_res[0]] +
+							_yVelocityTemp[index+_slabSize] + _yVelocityTemp[index-_slabSize] );
+
+					_zForce[index] = (1-w)*_zVelocityTemp[index] + 1./6. * w*(
+							_zVelocityTemp[index+1] + _zVelocityTemp[index-1] +
+							_zVelocityTemp[index+_res[0]] + _zVelocityTemp[index-_res[0]] +
+							_zVelocityTemp[index+_slabSize] + _zVelocityTemp[index-_slabSize] );
+					
+				}
+
+	}
 	}
 }
 
 //////////////////////////////////////////////////////////////////////
 // copy out the boundary in all directions
 //////////////////////////////////////////////////////////////////////
-void FLUID_3D::copyBorderAll(float* field)
+void FLUID_3D::copyBorderAll(float* field, int zBegin, int zEnd)
 {
-	int index;
+	int index, x, y, z;
+	int zSize = zEnd-zBegin;
+	int _blockTotalCells=_slabSize * zSize;
+
+	if ((zBegin==0))
 	for (int y = 0; y < _yRes; y++)
 		for (int x = 0; x < _xRes; x++)
 		{
 			// front slab
 			index = x + y * _xRes;
 			field[index] = field[index + _slabSize];
+    }
+
+	if (zEnd==_zRes)
+	for (y = 0; y < _yRes; y++)
+		for (x = 0; x < _xRes; x++)
+		{
 
 			// back slab
-			index += _totalCells - _slabSize;
+			index = x + y * _xRes + _blockTotalCells - _slabSize;
 			field[index] = field[index - _slabSize];
     }
 
-	for (int z = 0; z < _zRes; z++)
-		for (int x = 0; x < _xRes; x++)
+	for (z = 0; z < zSize; z++)
+		for (x = 0; x < _xRes; x++)
     {
 			// bottom slab
 			index = x + z * _slabSize;
@@ -294,8 +514,8 @@ void FLUID_3D::copyBorderAll(float* field)
 			field[index] = field[index - _xRes];
     }
 
-	for (int z = 0; z < _zRes; z++)
-		for (int y = 0; y < _yRes; y++)
+	for (z = 0; z < zSize; z++)
+		for (y = 0; y < _yRes; y++)
     {
 			// left slab
 			index = y * _xRes + z * _slabSize;
@@ -310,27 +530,123 @@ void FLUID_3D::copyBorderAll(float* field)
 //////////////////////////////////////////////////////////////////////
 // wipe boundaries of velocity and density
 //////////////////////////////////////////////////////////////////////
-void FLUID_3D::wipeBoundaries()
+void FLUID_3D::wipeBoundaries(int zBegin, int zEnd)
 {
-	setZeroBorder(_xVelocity, _res);
-	setZeroBorder(_yVelocity, _res);
-	setZeroBorder(_zVelocity, _res);
-	setZeroBorder(_density, _res);
+	setZeroBorder(_xVelocity, _res, zBegin, zEnd);
+	setZeroBorder(_yVelocity, _res, zBegin, zEnd);
+	setZeroBorder(_zVelocity, _res, zBegin, zEnd);
+	setZeroBorder(_density, _res, zBegin, zEnd);
 }
 
+void FLUID_3D::wipeBoundariesSL(int zBegin, int zEnd)
+{
+	
+	/////////////////////////////////////
+	// setZeroBorder to all:
+	/////////////////////////////////////
+
+	/////////////////////////////////////
+	// setZeroX
+	/////////////////////////////////////
+
+	const int slabSize = _xRes * _yRes;
+	int index, x,y,z;
+
+	for (z = zBegin; z < zEnd; z++)
+		for (y = 0; y < _yRes; y++)
+		{
+			// left slab
+			index = y * _xRes + z * slabSize;
+			_xVelocity[index] = 0.0f;
+			_yVelocity[index] = 0.0f;
+			_zVelocity[index] = 0.0f;
+			_density[index] = 0.0f;
+
+			// right slab
+			index += _xRes - 1;
+			_xVelocity[index] = 0.0f;
+			_yVelocity[index] = 0.0f;
+			_zVelocity[index] = 0.0f;
+			_density[index] = 0.0f;
+		}
+
+	/////////////////////////////////////
+	// setZeroY
+	/////////////////////////////////////
+
+	for (z = zBegin; z < zEnd; z++)
+		for (x = 0; x < _xRes; x++)
+		{
+			// bottom slab
+			index = x + z * slabSize;
+			_xVelocity[index] = 0.0f;
+			_yVelocity[index] = 0.0f;
+			_zVelocity[index] = 0.0f;
+			_density[index] = 0.0f;
+
+			// top slab
+			index += slabSize - _xRes;
+			_xVelocity[index] = 0.0f;
+			_yVelocity[index] = 0.0f;
+			_zVelocity[index] = 0.0f;
+			_density[index] = 0.0f;
+
+		}
+
+	/////////////////////////////////////
+	// setZeroZ
+	/////////////////////////////////////
+
+
+	const int totalCells = _xRes * _yRes * _zRes;
+
+	index = 0;
+	if ((zBegin == 0))
+	for (y = 0; y < _yRes; y++)
+		for (x = 0; x < _xRes; x++, index++)
+		{
+			// front slab
+			_xVelocity[index] = 0.0f;
+			_yVelocity[index] = 0.0f;
+			_zVelocity[index] = 0.0f;
+			_density[index] = 0.0f;
+    }
+
+	if (zEnd == _zRes)
+	{
+		index=0;
+		int indexx=0;
+		const int cellsslab = totalCells - slabSize;
+
+		for (y = 0; y < _yRes; y++)
+			for (x = 0; x < _xRes; x++, index++)
+			{
+
+				// back slab
+				indexx = index + cellsslab;
+				_xVelocity[indexx] = 0.0f;
+				_yVelocity[indexx] = 0.0f;
+				_zVelocity[indexx] = 0.0f;
+				_density[indexx] = 0.0f;
+			}
+	}
+
+}
 //////////////////////////////////////////////////////////////////////
 // add forces to velocity field
 //////////////////////////////////////////////////////////////////////
-void FLUID_3D::addForce()
+void FLUID_3D::addForce(int zBegin, int zEnd)
 {
-	for (int i = 0; i < _totalCells; i++)
+	int begin=zBegin * _slabSize;
+	int end=begin + (zEnd - zBegin) * _slabSize;
+
+	for (int i = begin; i < end; i++)
 	{
-		_xVelocity[i] += _dt * _xForce[i];
-		_yVelocity[i] += _dt * _yForce[i];
-		_zVelocity[i] += _dt * _zForce[i];
+		_xVelocityTemp[i] = _xVelocity[i] + _dt * _xForce[i];
+		_yVelocityTemp[i] = _yVelocity[i] + _dt * _yForce[i];
+		_zVelocityTemp[i] = _zVelocity[i] + _dt * _zForce[i];
 	}
 }
-
 //////////////////////////////////////////////////////////////////////
 // project into divergence free field
 //////////////////////////////////////////////////////////////////////
@@ -344,18 +660,18 @@ void FLUID_3D::project()
 
 	memset(_pressure, 0, sizeof(float)*_totalCells);
 	memset(_divergence, 0, sizeof(float)*_totalCells);
-
-	setObstacleBoundaries(_pressure);
-
+	
+	setObstacleBoundaries(_pressure, 0, _zRes);
+	
 	// copy out the boundaries
-	if(DOMAIN_BC_LEFT == 0)  setNeumannX(_xVelocity, _res);
-	else setZeroX(_xVelocity, _res); 
+	if(DOMAIN_BC_LEFT == 0)  setNeumannX(_xVelocity, _res, 0, _zRes);
+	else setZeroX(_xVelocity, _res, 0, _zRes); 
 
-	if(DOMAIN_BC_TOP == 0)   setNeumannY(_yVelocity, _res);
-	else setZeroY(_yVelocity, _res); 
+	if(DOMAIN_BC_TOP == 0)   setNeumannY(_yVelocity, _res, 0, _zRes);
+	else setZeroY(_yVelocity, _res, 0, _zRes); 
 
-	if(DOMAIN_BC_FRONT == 0) setNeumannZ(_zVelocity, _res);
-	else setZeroZ(_zVelocity, _res);
+	if(DOMAIN_BC_FRONT == 0) setNeumannZ(_zVelocity, _res, 0, _zRes);
+	else setZeroZ(_zVelocity, _res, 0, _zRes);
 
 	// calculate divergence
 	index = _slabSize + _xRes + 1;
@@ -385,12 +701,12 @@ void FLUID_3D::project()
 				// DG: commenting this helps CG to get a better start, 10-20% speed improvement
 				// _pressure[index] = 0.0f;
 			}
-	copyBorderAll(_pressure);
+	copyBorderAll(_pressure, 0, _zRes);
 
 	// solve Poisson equation
 	solvePressurePre(_pressure, _divergence, _obstacles);
 
-	setObstaclePressure(_pressure);
+	setObstaclePressure(_pressure, 0, _zRes);
 
 	// project out solution
 	float invDx = 1.0f / _dx;
@@ -411,6 +727,9 @@ void FLUID_3D::project()
 	if (_divergence) delete[] _divergence;
 }
 
+
+
+
 //////////////////////////////////////////////////////////////////////
 // diffuse heat
 //////////////////////////////////////////////////////////////////////
@@ -418,13 +737,14 @@ void FLUID_3D::diffuseHeat()
 {
 	SWAP_POINTERS(_heat, _heatOld);
 
-	copyBorderAll(_heatOld);
+	copyBorderAll(_heatOld, 0, _zRes);
 	solveHeat(_heat, _heatOld, _obstacles);
 
 	// zero out inside obstacles
 	for (int x = 0; x < _totalCells; x++)
 		if (_obstacles[x])
 			_heat[x] = 0.0f;
+
 }
 
 //////////////////////////////////////////////////////////////////////
@@ -444,12 +764,28 @@ void FLUID_3D::addObstacle(OBSTACLE* obstacle)
 //////////////////////////////////////////////////////////////////////
 // calculate the obstacle directional types
 //////////////////////////////////////////////////////////////////////
-void FLUID_3D::setObstaclePressure(float *_pressure)
+void FLUID_3D::setObstaclePressure(float *_pressure, int zBegin, int zEnd)
 {
+
+	// compleately TODO
+
+	const size_t index_ = _slabSize + _xRes + 1;
+
+	//int vIndex=_slabSize + _xRes + 1;
+
+	int bb=0;
+	int bt=0;
+
+	if (zBegin == 0) {bb = 1;}
+	if (zEnd == _zRes) {bt = 1;}
+
 	// tag remaining obstacle blocks
-	for (int z = 1, index = _slabSize + _xRes + 1;
-			z < _zRes - 1; z++, index += 2 * _xRes)
+	for (int z = zBegin + bb; z < zEnd - bt; z++)
+	{
+		size_t index = index_ +(z-1)*_slabSize;
+
 		for (int y = 1; y < _yRes - 1; y++, index += 2)
+		{
 			for (int x = 1; x < _xRes - 1; x++, index++)
 		{
 			// could do cascade of ifs, but they are a pain
@@ -507,15 +843,33 @@ void FLUID_3D::setObstaclePressure(float *_pressure)
 				// this means it's not a full no-slip boundary condition
 				// but a "half-slip" - still looks ok right now
 			}
-		}
+			//vIndex++;
+		}	// x loop
+		//vIndex += 2;
+		}	// y loop
+		//vIndex += 2 * _xRes;
+	}	// z loop
 }
 
-void FLUID_3D::setObstacleBoundaries(float *_pressure)
+void FLUID_3D::setObstacleBoundaries(float *_pressure, int zBegin, int zEnd)
 {
 	// cull degenerate obstacles , move to addObstacle?
-	for (int z = 1, index = _slabSize + _xRes + 1;
-			z < _zRes - 1; z++, index += 2 * _xRes)
+
+	// r = b - Ax
+	const size_t index_ = _slabSize + _xRes + 1;
+
+	int bb=0;
+	int bt=0;
+
+	if (zBegin == 0) {bb = 1;}
+	if (zEnd == _zRes) {bt = 1;}
+
+	for (int z = zBegin + bb; z < zEnd - bt; z++)
+	{
+		size_t index = index_ +(z-1)*_slabSize;
+		
 		for (int y = 1; y < _yRes - 1; y++, index += 2)
+		{
 			for (int x = 1; x < _xRes - 1; x++, index++)
 			{
 				if (_obstacles[index] != EMPTY)
@@ -545,17 +899,22 @@ void FLUID_3D::setObstacleBoundaries(float *_pressure)
 					_zVelocity[index] = 0.0f;
 					_pressure[index] = 0.0f;
 				}
-			}
+				//vIndex++;
+			}	// x-loop
+			//vIndex += 2;
+		}	// y-loop
+		//vIndex += 2* _xRes;
+	}	// z-loop
 }
 
 //////////////////////////////////////////////////////////////////////
 // add buoyancy forces
 //////////////////////////////////////////////////////////////////////
-void FLUID_3D::addBuoyancy(float *heat, float *density)
+void FLUID_3D::addBuoyancy(float *heat, float *density, int zBegin, int zEnd)
 {
-	int index = 0;
+	int index = zBegin*_slabSize;
 
-	for (int z = 0; z < _zRes; z++)
+	for (int z = zBegin; z < zEnd; z++)
 		for (int y = 0; y < _yRes; y++)
 			for (int x = 0; x < _xRes; x++, index++)
 			{
@@ -566,30 +925,55 @@ void FLUID_3D::addBuoyancy(float *heat, float *density)
 //////////////////////////////////////////////////////////////////////
 // add vorticity to the force field
 //////////////////////////////////////////////////////////////////////
-void FLUID_3D::addVorticity()
+void FLUID_3D::addVorticity(int zBegin, int zEnd)
 {
-	int x,y,z,index;
+	//int x,y,z,index;
 	if(_vorticityEps<=0.) return;
 
+	int _blockSize=zEnd-zBegin;
+	int _blockTotalCells = _slabSize * (_blockSize+2);
+
 	float *_xVorticity, *_yVorticity, *_zVorticity, *_vorticity;
 
-	_xVorticity = new float[_totalCells];
-	_yVorticity = new float[_totalCells];
-	_zVorticity = new float[_totalCells];
-	_vorticity = new float[_totalCells];
+	int _vIndex = _slabSize + _xRes + 1;
+	int bb=0;
+	int bt=0;
+	int bb1=-1;
+	int bt1=-1;
+
+	if (zBegin == 0) {bb1 = 1; bb = 1; _blockTotalCells-=_blockSize;}
+	if (zEnd == _zRes) {bt1 = 1;bt = 1; _blockTotalCells-=_blockSize;}
+
+	_xVorticity = new float[_blockTotalCells];
+	_yVorticity = new float[_blockTotalCells];
+	_zVorticity = new float[_blockTotalCells];
+	_vorticity = new float[_blockTotalCells];
 
-	memset(_xVorticity, 0, sizeof(float)*_totalCells);
-	memset(_yVorticity, 0, sizeof(float)*_totalCells);
-	memset(_zVorticity, 0, sizeof(float)*_totalCells);
-	memset(_vorticity, 0, sizeof(float)*_totalCells);
+	memset(_xVorticity, 0, sizeof(float)*_blockTotalCells);
+	memset(_yVorticity, 0, sizeof(float)*_blockTotalCells);
+	memset(_zVorticity, 0, sizeof(float)*_blockTotalCells);
+	memset(_vorticity, 0, sizeof(float)*_blockTotalCells);
+
+	//const size_t indexsetupV=_slabSize;
+	const size_t index_ = _slabSize + _xRes + 1;
 
 	// calculate vorticity
 	float gridSize = 0.5f / _dx;
-	index = _slabSize + _xRes + 1;
-	for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-		for (y = 1; y < _yRes - 1; y++, index += 2)
-			for (x = 1; x < _xRes - 1; x++, index++)
+	//index = _slabSize + _xRes + 1;
+
+
+	size_t vIndex=_xRes + 1;
+
+	for (int z = zBegin + bb1; z < (zEnd - bt1); z++)
+	{
+		size_t index = index_ +(z-1)*_slabSize;
+		vIndex = index-(zBegin-1+bb)*_slabSize;
+
+		for (int y = 1; y < _yRes - 1; y++, index += 2)
+		{
+			for (int x = 1; x < _xRes - 1; x++, index++)
 			{
+
 				int up    = _obstacles[index + _xRes] ? index : index + _xRes;
 				int down  = _obstacles[index - _xRes] ? index : index - _xRes;
 				float dy  = (up == index || down == index) ? 1.0f / _dx : gridSize;
@@ -600,34 +984,51 @@ void FLUID_3D::addVorticity()
 				int left  = _obstacles[index - 1] ? index : index - 1;
 				float dx  = (right == index || right == index) ? 1.0f / _dx : gridSize;
 
-				_xVorticity[index] = (_zVelocity[up] - _zVelocity[down]) * dy + (-_yVelocity[out] + _yVelocity[in]) * dz;
-				_yVorticity[index] = (_xVelocity[out] - _xVelocity[in]) * dz + (-_zVelocity[right] + _zVelocity[left]) * dx;
-				_zVorticity[index] = (_yVelocity[right] - _yVelocity[left]) * dx + (-_xVelocity[up] + _xVelocity[down])* dy;
+				_xVorticity[vIndex] = (_zVelocity[up] - _zVelocity[down]) * dy + (-_yVelocity[out] + _yVelocity[in]) * dz;
+				_yVorticity[vIndex] = (_xVelocity[out] - _xVelocity[in]) * dz + (-_zVelocity[right] + _zVelocity[left]) * dx;
+				_zVorticity[vIndex] = (_yVelocity[right] - _yVelocity[left]) * dx + (-_xVelocity[up] + _xVelocity[down])* dy;
 
-				_vorticity[index] = sqrtf(_xVorticity[index] * _xVorticity[index] +
-						_yVorticity[index] * _yVorticity[index] +
-						_zVorticity[index] * _zVorticity[index]);
+				_vorticity[vIndex] = sqrtf(_xVorticity[vIndex] * _xVorticity[vIndex] +
+						_yVorticity[vIndex] * _yVorticity[vIndex] +
+						_zVorticity[vIndex] * _zVorticity[vIndex]);
+
+				vIndex++;
 			}
+			vIndex+=2;
+		}
+		//vIndex+=2*_xRes;
+	}
 
 	// calculate normalized vorticity vectors
 	float eps = _vorticityEps;
-	index = _slabSize + _xRes + 1;
-	for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-		for (y = 1; y < _yRes - 1; y++, index += 2)
-			for (x = 1; x < _xRes - 1; x++, index++)
+	
+	//index = _slabSize + _xRes + 1;
+	vIndex=_slabSize + _xRes + 1;
+
+	for (int z = zBegin + bb; z < (zEnd - bt); z++)
+	{
+		size_t index = index_ +(z-1)*_slabSize;
+		vIndex = index-(zBegin-1+bb)*_slabSize;
+
+		for (int y = 1; y < _yRes - 1; y++, index += 2)
+		{
+			for (int x = 1; x < _xRes - 1; x++, index++)
+			{
+				//
+
 				if (!_obstacles[index])
 				{
 					float N[3];
 
-					int up    = _obstacles[index + _xRes] ? index : index + _xRes;
-					int down  = _obstacles[index - _xRes] ? index : index - _xRes;
-					float dy  = (up == index || down == index) ? 1.0f / _dx : gridSize;
-					int out   = _obstacles[index + _slabSize] ? index : index + _slabSize;
-					int in    = _obstacles[index - _slabSize] ? index : index - _slabSize;
-					float dz  = (out == index || in == index) ? 1.0f / _dx : gridSize;
-					int right = _obstacles[index + 1] ? index : index + 1;
-					int left  = _obstacles[index - 1] ? index : index - 1;
-					float dx  = (right == index || right == index) ? 1.0f / _dx : gridSize;
+					int up    = _obstacles[index + _xRes] ? vIndex : vIndex + _xRes;
+					int down  = _obstacles[index - _xRes] ? vIndex : vIndex - _xRes;
+					float dy  = (up == vIndex || down == vIndex) ? 1.0f / _dx : gridSize;
+					int out   = _obstacles[index + _slabSize] ? vIndex : vIndex + _slabSize;
+					int in    = _obstacles[index - _slabSize] ? vIndex : vIndex - _slabSize;
+					float dz  = (out == vIndex || in == vIndex) ? 1.0f / _dx : gridSize;
+					int right = _obstacles[index + 1] ? vIndex : vIndex + 1;
+					int left  = _obstacles[index - 1] ? vIndex : vIndex - 1;
+					float dx  = (right == vIndex || right == vIndex) ? 1.0f / _dx : gridSize;
 					N[0] = (_vorticity[right] - _vorticity[left]) * dx;
 					N[1] = (_vorticity[up] - _vorticity[down]) * dy;
 					N[2] = (_vorticity[out] - _vorticity[in]) * dz;
@@ -640,11 +1041,17 @@ void FLUID_3D::addVorticity()
 						N[1] *= magnitude;
 						N[2] *= magnitude;
 
-						_xForce[index] += (N[1] * _zVorticity[index] - N[2] * _yVorticity[index]) * _dx * eps;
-						_yForce[index] -= (N[0] * _zVorticity[index] - N[2] * _xVorticity[index]) * _dx * eps;
-						_zForce[index] += (N[0] * _yVorticity[index] - N[1] * _xVorticity[index]) * _dx * eps;
+						_xForce[index] += (N[1] * _zVorticity[vIndex] - N[2] * _yVorticity[vIndex]) * _dx * eps;
+						_yForce[index] -= (N[0] * _zVorticity[vIndex] - N[2] * _xVorticity[vIndex]) * _dx * eps;
+						_zForce[index] += (N[0] * _yVorticity[vIndex] - N[1] * _xVorticity[vIndex]) * _dx * eps;
 					}
-				}
+					}	// if
+					vIndex++;
+					}	// x loop
+				vIndex+=2;
+				}		// y loop
+			//vIndex+=2*_xRes;
+		}				// z loop
 				
 	if (_xVorticity) delete[] _xVorticity;
 	if (_yVorticity) delete[] _yVorticity;
@@ -652,54 +1059,80 @@ void FLUID_3D::addVorticity()
 	if (_vorticity) delete[] _vorticity;
 }
 
+
+void FLUID_3D::advectMacCormackBegin(int zBegin, int zEnd)
+{
+	Vec3Int res = Vec3Int(_xRes,_yRes,_zRes);
+
+	if(DOMAIN_BC_LEFT == 0) copyBorderX(_xVelocityOld, res, zBegin, zEnd);
+	else setZeroX(_xVelocityOld, res, zBegin, zEnd);
+
+	if(DOMAIN_BC_TOP == 0) copyBorderY(_yVelocityOld, res, zBegin, zEnd);
+	else setZeroY(_yVelocityOld, res, zBegin, zEnd); 
+
+	if(DOMAIN_BC_FRONT == 0) copyBorderZ(_zVelocityOld, res, zBegin, zEnd);
+	else setZeroZ(_zVelocityOld, res, zBegin, zEnd);
+}
+
 //////////////////////////////////////////////////////////////////////
 // Advect using the MacCormack method from the Selle paper
 //////////////////////////////////////////////////////////////////////
-void FLUID_3D::advectMacCormack()
+void FLUID_3D::advectMacCormackEnd1(int zBegin, int zEnd)
 {
 	Vec3Int res = Vec3Int(_xRes,_yRes,_zRes);
 
-	if(DOMAIN_BC_LEFT == 0) copyBorderX(_xVelocity, res);
-	else setZeroX(_xVelocity, res); 
+	const float dt0 = _dt / _dx;
 
-	if(DOMAIN_BC_TOP == 0) copyBorderY(_yVelocity, res);
-	else setZeroY(_yVelocity, res); 
+	int begin=zBegin * _slabSize;
+	int end=begin + (zEnd - zBegin) * _slabSize;
+	for (int x = begin; x < end; x++)
+		_xForce[x] = 0.0;
 
-	if(DOMAIN_BC_FRONT == 0) copyBorderZ(_zVelocity, res);
-	else setZeroZ(_zVelocity, res);
+	// advectFieldMacCormack1(dt, xVelocity, yVelocity, zVelocity, oldField, newField, res)
 
-	SWAP_POINTERS(_xVelocity, _xVelocityOld);
-	SWAP_POINTERS(_yVelocity, _yVelocityOld);
-	SWAP_POINTERS(_zVelocity, _zVelocityOld);
-	SWAP_POINTERS(_density, _densityOld);
-	SWAP_POINTERS(_heat, _heatOld);
+	advectFieldMacCormack1(dt0, _xVelocityOld, _yVelocityOld, _zVelocityOld, _densityOld, _densityTemp, res, zBegin, zEnd);
+	advectFieldMacCormack1(dt0, _xVelocityOld, _yVelocityOld, _zVelocityOld, _heatOld, _heatTemp, res, zBegin, zEnd);
+	advectFieldMacCormack1(dt0, _xVelocityOld, _yVelocityOld, _zVelocityOld, _xVelocityOld, _xVelocity, res, zBegin, zEnd);
+	advectFieldMacCormack1(dt0, _xVelocityOld, _yVelocityOld, _zVelocityOld, _yVelocityOld, _yVelocity, res, zBegin, zEnd);
+	advectFieldMacCormack1(dt0, _xVelocityOld, _yVelocityOld, _zVelocityOld, _zVelocityOld, _zVelocity, res, zBegin, zEnd);
 
+	// Have to wait untill all the threads are done -> so continuing in step 3
+}
+
+//////////////////////////////////////////////////////////////////////
+// Advect using the MacCormack method from the Selle paper
+//////////////////////////////////////////////////////////////////////
+void FLUID_3D::advectMacCormackEnd2(int zBegin, int zEnd)
+{
 	const float dt0 = _dt / _dx;
-	// use force arrays as temp arrays
-	for (int x = 0; x < _totalCells; x++)
-		_xForce[x] = _yForce[x] = 0.0;
+	Vec3Int res = Vec3Int(_xRes,_yRes,_zRes);
 
+	// use force array as temp arrays
 	float* t1 = _xForce;
-	float* t2 = _yForce;
 
-	advectFieldMacCormack(dt0, _xVelocityOld, _yVelocityOld, _zVelocityOld, _densityOld, _density, t1,t2, res, _obstacles);
-	advectFieldMacCormack(dt0, _xVelocityOld, _yVelocityOld, _zVelocityOld, _heatOld, _heat, t1,t2, res, _obstacles);
-	advectFieldMacCormack(dt0, _xVelocityOld, _yVelocityOld, _zVelocityOld, _xVelocityOld, _xVelocity, t1,t2, res, _obstacles);
-	advectFieldMacCormack(dt0, _xVelocityOld, _yVelocityOld, _zVelocityOld, _yVelocityOld, _yVelocity, t1,t2, res, _obstacles);
-	advectFieldMacCormack(dt0, _xVelocityOld, _yVelocityOld, _zVelocityOld, _zVelocityOld, _zVelocity, t1,t2, res, _obstacles);
+	// advectFieldMacCormack2(dt, xVelocity, yVelocity, zVelocity, oldField, newField, tempfield, temp, res, obstacles)
+
+	advectFieldMacCormack2(dt0, _xVelocityOld, _yVelocityOld, _zVelocityOld, _densityOld, _density, _densityTemp, t1, res, _obstacles, zBegin, zEnd);
+	advectFieldMacCormack2(dt0, _xVelocityOld, _yVelocityOld, _zVelocityOld, _heatOld, _heat, _heatTemp, t1, res, _obstacles, zBegin, zEnd);
+	advectFieldMacCormack2(dt0, _xVelocityOld, _yVelocityOld, _zVelocityOld, _xVelocityOld, _xVelocityTemp, _xVelocity, t1, res, _obstacles, zBegin, zEnd);
+	advectFieldMacCormack2(dt0, _xVelocityOld, _yVelocityOld, _zVelocityOld, _yVelocityOld, _yVelocityTemp, _yVelocity, t1, res, _obstacles, zBegin, zEnd);
+	advectFieldMacCormack2(dt0, _xVelocityOld, _yVelocityOld, _zVelocityOld, _zVelocityOld, _zVelocityTemp, _zVelocity, t1, res, _obstacles, zBegin, zEnd);
+
+	if(DOMAIN_BC_LEFT == 0) copyBorderX(_xVelocityTemp, res, zBegin, zEnd);
+	else setZeroX(_xVelocityTemp, res, zBegin, zEnd);				
 
-	if(DOMAIN_BC_LEFT == 0) copyBorderX(_xVelocity, res);
-	else setZeroX(_xVelocity, res); 
+	if(DOMAIN_BC_TOP == 0) copyBorderY(_yVelocityTemp, res, zBegin, zEnd);
+	else setZeroY(_yVelocityTemp, res, zBegin, zEnd); 
 
-	if(DOMAIN_BC_TOP == 0) copyBorderY(_yVelocity, res);
-	else setZeroY(_yVelocity, res); 
+	if(DOMAIN_BC_FRONT == 0) copyBorderZ(_zVelocityTemp, res, zBegin, zEnd);
+	else setZeroZ(_zVelocityTemp, res, zBegin, zEnd);
 
-	if(DOMAIN_BC_FRONT == 0) copyBorderZ(_zVelocity, res);
-	else setZeroZ(_zVelocity, res);
+	setZeroBorder(_density, res, zBegin, zEnd);
+	setZeroBorder(_heat, res, zBegin, zEnd);
 
-	setZeroBorder(_density, res);
-	setZeroBorder(_heat, res);
 
-  for (int x = 0; x < _totalCells; x++)
-    t1[x] = t2[x] = 0.0;
+	/*int begin=zBegin * _slabSize;
+	int end=begin + (zEnd - zBegin) * _slabSize;
+  for (int x = begin; x < end; x++)
+    _xForce[x] = _yForce[x] = 0.0f;*/
 }
diff --git a/intern/smoke/intern/FLUID_3D.h b/intern/smoke/intern/FLUID_3D.h
index a7be7f58335..ab121ef506e 100644
--- a/intern/smoke/intern/FLUID_3D.h
+++ b/intern/smoke/intern/FLUID_3D.h
@@ -19,6 +19,11 @@
 // FLUID_3D.h: interface for the FLUID_3D class.
 //
 //////////////////////////////////////////////////////////////////////
+// Heavy parallel optimization done. Many of the old functions now
+// take begin and end parameters and process only specified part of the data.
+// Some functions were divided into multiple ones.
+//		- MiikaH
+//////////////////////////////////////////////////////////////////////
 
 #ifndef FLUID_3D_H
 #define FLUID_3D_H
@@ -74,7 +79,8 @@ class FLUID_3D
 		int _totalImgDumps;
 		int _totalVelDumps;
 
-    void artificialDamping(float* field);
+		void artificialDampingSL(int zBegin, int zEnd);
+		void artificialDampingExactSL(int pos);
 
 		// fields
 		float* _density;
@@ -92,6 +98,13 @@ class FLUID_3D
 		float* _zForce;
 		unsigned char*  _obstacles;
 
+		// Required for proper threading:
+		float* _xVelocityTemp;
+		float* _yVelocityTemp;
+		float* _zVelocityTemp;
+		float* _heatTemp;
+		float* _densityTemp;
+
 		// CG fields
 		int _iterations;
 
@@ -107,13 +120,14 @@ class FLUID_3D
 		// WTURBULENCE* _wTurbulence;
 
 		// boundary setting functions
-		void copyBorderAll(float* field);
+		void copyBorderAll(float* field, int zBegin, int zEnd);
 
 		// timestepping functions
-		void wipeBoundaries();
-		void addForce();
-		void addVorticity();
-		void addBuoyancy(float *heat, float *density);
+		void wipeBoundaries(int zBegin, int zEnd);
+		void wipeBoundariesSL(int zBegin, int zEnd);
+		void addForce(int zBegin, int zEnd);
+		void addVorticity(int zBegin, int zEnd);
+		void addBuoyancy(float *heat, float *density, int zBegin, int zEnd);
 
 		// solver stuff
 		void project();
@@ -122,41 +136,58 @@ class FLUID_3D
 		void solvePressurePre(float* field, float* b, unsigned char* skip);
 		void solveHeat(float* field, float* b, unsigned char* skip);
 
+
 		// handle obstacle boundaries
-		void setObstacleBoundaries(float *_pressure);
-		void setObstaclePressure(float *_pressure);
+		void setObstacleBoundaries(float *_pressure, int zBegin, int zEnd);
+		void setObstaclePressure(float *_pressure, int zBegin, int zEnd);
 
 	public:
 		// advection, accessed e.g. by WTURBULENCE class
-		void advectMacCormack();
+		//void advectMacCormack();
+		void advectMacCormackBegin(int zBegin, int zEnd);
+		void advectMacCormackEnd1(int zBegin, int zEnd);
+		void advectMacCormackEnd2(int zBegin, int zEnd);
 
 		// boundary setting functions
-		static void copyBorderX(float* field, Vec3Int res);
-		static void copyBorderY(float* field, Vec3Int res);
-		static void copyBorderZ(float* field, Vec3Int res);
-		static void setNeumannX(float* field, Vec3Int res);
-		static void setNeumannY(float* field, Vec3Int res);
-		static void setNeumannZ(float* field, Vec3Int res);
-		static void setZeroX(float* field, Vec3Int res);
-		static void setZeroY(float* field, Vec3Int res);
-		static void setZeroZ(float* field, Vec3Int res);
-		static void setZeroBorder(float* field, Vec3Int res) {
-			setZeroX(field, res);
-			setZeroY(field, res);
-			setZeroZ(field, res);
+		static void copyBorderX(float* field, Vec3Int res, int zBegin, int zEnd);
+		static void copyBorderY(float* field, Vec3Int res, int zBegin, int zEnd);
+		static void copyBorderZ(float* field, Vec3Int res, int zBegin, int zEnd);
+		static void setNeumannX(float* field, Vec3Int res, int zBegin, int zEnd);
+		static void setNeumannY(float* field, Vec3Int res, int zBegin, int zEnd);
+		static void setNeumannZ(float* field, Vec3Int res, int zBegin, int zEnd);
+		static void setZeroX(float* field, Vec3Int res, int zBegin, int zEnd);
+		static void setZeroY(float* field, Vec3Int res, int zBegin, int zEnd);
+		static void setZeroZ(float* field, Vec3Int res, int zBegin, int zEnd);
+		static void setZeroBorder(float* field, Vec3Int res, int zBegin, int zEnd) {
+			setZeroX(field, res, zBegin, zEnd);
+			setZeroY(field, res, zBegin, zEnd);
+			setZeroZ(field, res, zBegin, zEnd);
 		};
 
+		
+
 		// static advection functions, also used by WTURBULENCE
 		static void advectFieldSemiLagrange(const float dt, const float* velx, const float* vely,  const float* velz,
-				float* oldField, float* newField, Vec3Int res);
-		static void advectFieldMacCormack(const float dt, const float* xVelocity, const float* yVelocity, const float* zVelocity, 
-				float* oldField, float* newField, float* temp1, float* temp2, Vec3Int res, const unsigned char* obstacles);
+				float* oldField, float* newField, Vec3Int res, int zBegin, int zEnd);
+		static void advectFieldMacCormack1(const float dt, const float* xVelocity, const float* yVelocity, const float* zVelocity, 
+				float* oldField, float* tempResult, Vec3Int res, int zBegin, int zEnd);
+		static void advectFieldMacCormack2(const float dt, const float* xVelocity, const float* yVelocity, const float* zVelocity, 
+				float* oldField, float* newField, float* tempResult, float* temp1,Vec3Int res, const unsigned char* obstacles, int zBegin, int zEnd);
+
+
+		// temp ones for testing
+		/*static void advectFieldMacCormack(const float dt, const float* xVelocity, const float* yVelocity, const float* zVelocity, 
+				float* oldField, float* newField, float* temp1, float* temp2, Vec3Int res, const unsigned char* obstacles);*/
+		/*static void advectFieldSemiLagrange2(const float dt, const float* velx, const float* vely,  const float* velz,
+				float* oldField, float* newField, Vec3Int res);*/
 
 		// maccormack helper functions
 		static void clampExtrema(const float dt, const float* xVelocity, const float* yVelocity,  const float* zVelocity,
-				float* oldField, float* newField, Vec3Int res);
+				float* oldField, float* newField, Vec3Int res, int zBegin, int zEnd);
 		static void clampOutsideRays(const float dt, const float* xVelocity, const float* yVelocity,  const float* zVelocity,
-				float* oldField, float* newField, Vec3Int res, const unsigned char* obstacles, const float *oldAdvection);
+				float* oldField, float* newField, Vec3Int res, const unsigned char* obstacles, const float *oldAdvection, int zBegin, int zEnd);
+
+
 
 		// output helper functions
 		// static void writeImageSliceXY(const float *field, Vec3Int res, int slice, string prefix, int picCnt, float scale=1.);
diff --git a/intern/smoke/intern/FLUID_3D_SOLVERS.cpp b/intern/smoke/intern/FLUID_3D_SOLVERS.cpp
index 7d078d86d61..8b961494ce5 100644
--- a/intern/smoke/intern/FLUID_3D_SOLVERS.cpp
+++ b/intern/smoke/intern/FLUID_3D_SOLVERS.cpp
@@ -19,16 +19,25 @@
 // FLUID_3D.cpp: implementation of the FLUID_3D class.
 //
 //////////////////////////////////////////////////////////////////////
+// Both solvers optimized by merging loops and precalculating
+// stuff used in iteration loop.
+//		- MiikaH
+//////////////////////////////////////////////////////////////////////
 
 #include "FLUID_3D.h"
 #include <cstring>
 #define SOLVER_ACCURACY 1e-06
 
-void FLUID_3D::solvePressurePre(float* field, float* b, unsigned char* skip)
+//////////////////////////////////////////////////////////////////////
+// solve the heat equation with CG
+//////////////////////////////////////////////////////////////////////
+void FLUID_3D::solveHeat(float* field, float* b, unsigned char* skip)
 {
 	int x, y, z;
+	const int twoxr = 2 * _xRes;
 	size_t index;
-	float *_q, *_Precond, *_h, *_residual, *_direction;
+	const float heatConst = _dt * _heatDiffusion / (_dx * _dx);
+	float *_q, *_residual, *_direction, *_Acenter;
 
 	// i = 0
 	int i = 0;
@@ -36,246 +45,201 @@ void FLUID_3D::solvePressurePre(float* field, float* b, unsigned char* skip)
 	_residual     = new float[_totalCells]; // set 0
 	_direction    = new float[_totalCells]; // set 0
 	_q            = new float[_totalCells]; // set 0
-	_h			  = new float[_totalCells]; // set 0
-	_Precond	  = new float[_totalCells]; // set 0
+	_Acenter       = new float[_totalCells]; // set 0
 
-	memset(_residual, 0, sizeof(float)*_xRes*_yRes*_zRes);
-	memset(_q, 0, sizeof(float)*_xRes*_yRes*_zRes);
-	memset(_direction, 0, sizeof(float)*_xRes*_yRes*_zRes);
-	memset(_h, 0, sizeof(float)*_xRes*_yRes*_zRes);
-	memset(_Precond, 0, sizeof(float)*_xRes*_yRes*_zRes);
-
-	// r = b - Ax
-	index = _slabSize + _xRes + 1;
-	for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-		for (y = 1; y < _yRes - 1; y++, index += 2)
-		  for (x = 1; x < _xRes - 1; x++, index++)
-		  {
-			// if the cell is a variable
-			float Acenter = 0.0f;
-			if (!skip[index])
-			{
-			  // set the matrix to the Poisson stencil in order
-			  if (!skip[index + 1]) Acenter += 1.;
-			  if (!skip[index - 1]) Acenter += 1.;
-			  if (!skip[index + _xRes]) Acenter += 1.;
-			  if (!skip[index - _xRes]) Acenter += 1.;
-			  if (!skip[index + _slabSize]) Acenter += 1.;
-			  if (!skip[index - _slabSize]) Acenter += 1.;
-			}
-		    
-			_residual[index] = b[index] - (Acenter * field[index] +  
-			  field[index - 1] * (skip[index - 1] ? 0.0 : -1.0f)+ 
-			  field[index + 1] * (skip[index + 1] ? 0.0 : -1.0f)+
-			  field[index - _xRes] * (skip[index - _xRes] ? 0.0 : -1.0f)+ 
-			  field[index + _xRes] * (skip[index + _xRes] ? 0.0 : -1.0f)+
-			  field[index - _slabSize] * (skip[index - _slabSize] ? 0.0 : -1.0f)+ 
-			  field[index + _slabSize] * (skip[index + _slabSize] ? 0.0 : -1.0f) );
-			_residual[index] = (skip[index]) ? 0.0f : _residual[index];
+  	memset(_residual, 0, sizeof(float)*_totalCells);
+	memset(_q, 0, sizeof(float)*_totalCells);
+	memset(_direction, 0, sizeof(float)*_totalCells);
+	memset(_Acenter, 0, sizeof(float)*_totalCells);
 
-			// P^-1
-			if(Acenter < 1.0)
-				_Precond[index] = 0.0;
-			else
-				_Precond[index] = 1.0 / Acenter;
+	float deltaNew = 0.0f;
 
-			// p = P^-1 * r
-			_direction[index] = _residual[index] * _Precond[index];
-		  }
+  // r = b - Ax
+  index = _slabSize + _xRes + 1;
+  for (z = 1; z < _zRes - 1; z++, index += twoxr)
+    for (y = 1; y < _yRes - 1; y++, index += 2)
+      for (x = 1; x < _xRes - 1; x++, index++)
+      {
+        // if the cell is a variable
+        _Acenter[index] = 1.0f;
+        if (!skip[index])
+        {
+          // set the matrix to the Poisson stencil in order
+          if (!skip[index + 1]) _Acenter[index] += heatConst;
+          if (!skip[index - 1]) _Acenter[index] += heatConst;
+          if (!skip[index + _xRes]) _Acenter[index] += heatConst;
+          if (!skip[index - _xRes]) _Acenter[index] += heatConst;
+          if (!skip[index + _slabSize]) _Acenter[index] += heatConst;
+          if (!skip[index - _slabSize]) _Acenter[index] += heatConst;
+
+		  _residual[index] = b[index] - (_Acenter[index] * field[index] + 
+          field[index - 1] * (skip[index - 1] ? 0.0 : -heatConst) + 
+          field[index + 1] * (skip[index + 1] ? 0.0 : -heatConst) +
+          field[index - _xRes] * (skip[index - _xRes] ? 0.0 : -heatConst) + 
+          field[index + _xRes] * (skip[index + _xRes] ? 0.0 : -heatConst) +
+          field[index - _slabSize] * (skip[index - _slabSize] ? 0.0 : -heatConst) + 
+          field[index + _slabSize] * (skip[index + _slabSize] ? 0.0 : -heatConst));
+        }
+		else
+		{
+        _residual[index] = 0.0f;
+		}
 
-	// deltaNew = transpose(r) * p
-	float deltaNew = 0.0f;
-	index = _slabSize + _xRes + 1;
-	for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-		for (y = 1; y < _yRes - 1; y++, index += 2)
-		  for (x = 1; x < _xRes - 1; x++, index++)
-			deltaNew += _residual[index] * _direction[index];
+		_direction[index] = _residual[index];
+		deltaNew += _residual[index] * _residual[index];
+      }
 
-	// delta0 = deltaNew
-	// float delta0 = deltaNew;
 
   // While deltaNew > (eps^2) * delta0
   const float eps  = SOLVER_ACCURACY;
-  //while ((i < _iterations) && (deltaNew > eps*delta0))
   float maxR = 2.0f * eps;
-  // while (i < _iterations)
-  while ((i < _iterations) && (maxR > 0.001*eps))
+  while ((i < _iterations) && (maxR > eps))
   {
-    // (s) q = Ad (p)
+    // q = Ad
+	float alpha = 0.0f;
+
     index = _slabSize + _xRes + 1;
-    for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
+    for (z = 1; z < _zRes - 1; z++, index += twoxr)
       for (y = 1; y < _yRes - 1; y++, index += 2)
         for (x = 1; x < _xRes - 1; x++, index++)
         {
           // if the cell is a variable
-          float Acenter = 0.0f;
           if (!skip[index])
           {
-            // set the matrix to the Poisson stencil in order
-            if (!skip[index + 1]) Acenter += 1.;
-            if (!skip[index - 1]) Acenter += 1.;
-            if (!skip[index + _xRes]) Acenter += 1.;
-            if (!skip[index - _xRes]) Acenter += 1.;
-            if (!skip[index + _slabSize]) Acenter += 1.;
-            if (!skip[index - _slabSize]) Acenter += 1.;
+
+			_q[index] = (_Acenter[index] * _direction[index] + 
+            _direction[index - 1] * (skip[index - 1] ? 0.0 : -heatConst) + 
+            _direction[index + 1] * (skip[index + 1] ? 0.0 : -heatConst) +
+            _direction[index - _xRes] * (skip[index - _xRes] ? 0.0 : -heatConst) + 
+            _direction[index + _xRes] * (skip[index + _xRes] ? 0.0 : -heatConst) +
+            _direction[index - _slabSize] * (skip[index - _slabSize] ? 0.0 : -heatConst) + 
+            _direction[index + _slabSize] * (skip[index + _slabSize] ? 0.0 : -heatConst));
           }
-          
-          _q[index] = Acenter * _direction[index] +  
-            _direction[index - 1] * (skip[index - 1] ? 0.0 : -1.0f) + 
-            _direction[index + 1] * (skip[index + 1] ? 0.0 : -1.0f) +
-            _direction[index - _xRes] * (skip[index - _xRes] ? 0.0 : -1.0f) + 
-            _direction[index + _xRes] * (skip[index + _xRes] ? 0.0 : -1.0f)+
-            _direction[index - _slabSize] * (skip[index - _slabSize] ? 0.0 : -1.0f) + 
-            _direction[index + _slabSize] * (skip[index + _slabSize] ? 0.0 : -1.0f);
-          _q[index] = (skip[index]) ? 0.0f : _q[index];
+		  else
+		  {
+          _q[index] = 0.0f;
+		  }
+		  alpha += _direction[index] * _q[index];
         }
 
-    // alpha = deltaNew / (transpose(d) * q)
-    float alpha = 0.0f;
-    index = _slabSize + _xRes + 1;
-    for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-      for (y = 1; y < _yRes - 1; y++, index += 2)
-        for (x = 1; x < _xRes - 1; x++, index++)
-          alpha += _direction[index] * _q[index];
     if (fabs(alpha) > 0.0f)
       alpha = deltaNew / alpha;
+	
+	float deltaOld = deltaNew;
+	deltaNew = 0.0f;
 
-    // x = x + alpha * d
-    index = _slabSize + _xRes + 1;
-    for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-      for (y = 1; y < _yRes - 1; y++, index += 2)
-        for (x = 1; x < _xRes - 1; x++, index++)
-          field[index] += alpha * _direction[index];
+	maxR = 0.0f;
 
-    // r = r - alpha * q
-	maxR = 0.0;
     index = _slabSize + _xRes + 1;
-    for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-      for (y = 1; y < _yRes - 1; y++, index += 2)
-        for (x = 1; x < _xRes - 1; x++, index++)
-        {
-          _residual[index] -= alpha * _q[index];
-		  // maxR = (_residual[index] > maxR) ? _residual[index] : maxR;
-        }
-
-	// if(maxR <= eps)
-	// 	break;
-
-	// h = P^-1 * r
-	 index = _slabSize + _xRes + 1;
-    for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
+    for (z = 1; z < _zRes - 1; z++, index += twoxr)
       for (y = 1; y < _yRes - 1; y++, index += 2)
         for (x = 1; x < _xRes - 1; x++, index++)
 		{
-			_h[index] = _Precond[index] * _residual[index];
-		}
+          field[index] += alpha * _direction[index];
 
-    // deltaOld = deltaNew
-    float deltaOld = deltaNew;
+		  _residual[index] -= alpha * _q[index];
+          maxR = (_residual[index] > maxR) ? _residual[index] : maxR;
 
-    // deltaNew = transpose(r) * h
-    deltaNew = 0.0f;
-    index = _slabSize + _xRes + 1;
-    for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-      for (y = 1; y < _yRes - 1; y++, index += 2)
-        for (x = 1; x < _xRes - 1; x++, index++)
-		{
-          deltaNew += _residual[index] * _h[index];
-		  maxR = (_residual[index]* _h[index] > maxR) ? _residual[index]* _h[index] : maxR;
+		  deltaNew += _residual[index] * _residual[index];
 		}
 
-    // beta = deltaNew / deltaOld
     float beta = deltaNew / deltaOld;
 
-    // d = h + beta * d
     index = _slabSize + _xRes + 1;
-    for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
+    for (z = 1; z < _zRes - 1; z++, index += twoxr)
       for (y = 1; y < _yRes - 1; y++, index += 2)
         for (x = 1; x < _xRes - 1; x++, index++)
-          _direction[index] = _h[index] + beta * _direction[index];
+         _direction[index] = _residual[index] + beta * _direction[index];
 
-    // i = i + 1
+	
     i++;
   }
-  // cout << i << " iterations converged to " << sqrt(maxR) << endl;
+  // cout << i << " iterations converged to " << maxR << endl;
 
-	if (_h) delete[] _h;
-	if (_Precond) delete[] _Precond;
 	if (_residual) delete[] _residual;
 	if (_direction) delete[] _direction;
 	if (_q)       delete[] _q;
+	if (_Acenter)  delete[] _Acenter;
 }
 
-//////////////////////////////////////////////////////////////////////
-// solve the poisson equation with CG
-//////////////////////////////////////////////////////////////////////
-#if 0
-void FLUID_3D::solvePressure(float* field, float* b, unsigned char* skip)
+
+void FLUID_3D::solvePressurePre(float* field, float* b, unsigned char* skip)
 {
 	int x, y, z;
 	size_t index;
+	float *_q, *_Precond, *_h, *_residual, *_direction;
 
 	// i = 0
 	int i = 0;
 
+	_residual     = new float[_totalCells]; // set 0
+	_direction    = new float[_totalCells]; // set 0
+	_q            = new float[_totalCells]; // set 0
+	_h			  = new float[_totalCells]; // set 0
+	_Precond	  = new float[_totalCells]; // set 0
+
 	memset(_residual, 0, sizeof(float)*_xRes*_yRes*_zRes);
 	memset(_q, 0, sizeof(float)*_xRes*_yRes*_zRes);
 	memset(_direction, 0, sizeof(float)*_xRes*_yRes*_zRes);
+	memset(_h, 0, sizeof(float)*_xRes*_yRes*_zRes);
+	memset(_Precond, 0, sizeof(float)*_xRes*_yRes*_zRes);
 
-  // r = b - Ax
-  index = _slabSize + _xRes + 1;
-  for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-    for (y = 1; y < _yRes - 1; y++, index += 2)
-      for (x = 1; x < _xRes - 1; x++, index++)
-      {
-        // if the cell is a variable
-        float Acenter = 0.0f;
-        if (!skip[index])
-        {
-          // set the matrix to the Poisson stencil in order
-          if (!skip[index + 1]) Acenter += 1.;
-          if (!skip[index - 1]) Acenter += 1.;
-          if (!skip[index + _xRes]) Acenter += 1.;
-          if (!skip[index - _xRes]) Acenter += 1.;
-          if (!skip[index + _slabSize]) Acenter += 1.;
-          if (!skip[index - _slabSize]) Acenter += 1.;
-        }
-        
-        _residual[index] = b[index] - (Acenter * field[index] +  
-          field[index - 1] * (skip[index - 1] ? 0.0 : -1.0f)+ 
-          field[index + 1] * (skip[index + 1] ? 0.0 : -1.0f)+
-          field[index - _xRes] * (skip[index - _xRes] ? 0.0 : -1.0f)+ 
-          field[index + _xRes] * (skip[index + _xRes] ? 0.0 : -1.0f)+
-          field[index - _slabSize] * (skip[index - _slabSize] ? 0.0 : -1.0f)+ 
-          field[index + _slabSize] * (skip[index + _slabSize] ? 0.0 : -1.0f) );
-        _residual[index] = (skip[index]) ? 0.0f : _residual[index];
-      }
-	
+	float deltaNew = 0.0f;
 
-  // d = r
-  index = _slabSize + _xRes + 1;
-  for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-    for (y = 1; y < _yRes - 1; y++, index += 2)
-      for (x = 1; x < _xRes - 1; x++, index++)
-        _direction[index] = _residual[index];
+	// r = b - Ax
+	index = _slabSize + _xRes + 1;
+	for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
+		for (y = 1; y < _yRes - 1; y++, index += 2)
+		  for (x = 1; x < _xRes - 1; x++, index++)
+		  {
+			// if the cell is a variable
+			float Acenter = 0.0f;
+			if (!skip[index])
+			{
+			  // set the matrix to the Poisson stencil in order
+			  if (!skip[index + 1]) Acenter += 1.;
+			  if (!skip[index - 1]) Acenter += 1.;
+			  if (!skip[index + _xRes]) Acenter += 1.;
+			  if (!skip[index - _xRes]) Acenter += 1.;
+			  if (!skip[index + _slabSize]) Acenter += 1.;
+			  if (!skip[index - _slabSize]) Acenter += 1.;
 
-  // deltaNew = transpose(r) * r
-  float deltaNew = 0.0f;
-  index = _slabSize + _xRes + 1;
-  for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-    for (y = 1; y < _yRes - 1; y++, index += 2)
-      for (x = 1; x < _xRes - 1; x++, index++)
-        deltaNew += _residual[index] * _residual[index];
+			  _residual[index] = b[index] - (Acenter * field[index] +  
+			  field[index - 1] * (skip[index - 1] ? 0.0 : -1.0f)+ 
+			  field[index + 1] * (skip[index + 1] ? 0.0 : -1.0f)+
+			  field[index - _xRes] * (skip[index - _xRes] ? 0.0 : -1.0f)+ 
+			  field[index + _xRes] * (skip[index + _xRes] ? 0.0 : -1.0f)+
+			  field[index - _slabSize] * (skip[index - _slabSize] ? 0.0 : -1.0f)+ 
+			  field[index + _slabSize] * (skip[index + _slabSize] ? 0.0 : -1.0f) );
+			}
+			else
+			{
+			_residual[index] = 0.0f;
+			}
+
+			// P^-1
+			if(Acenter < 1.0)
+				_Precond[index] = 0.0;
+			else
+				_Precond[index] = 1.0 / Acenter;
+
+			// p = P^-1 * r
+			_direction[index] = _residual[index] * _Precond[index];
+
+			deltaNew += _residual[index] * _direction[index];
+		  }
 
-  // delta0 = deltaNew
-  // float delta0 = deltaNew;
 
   // While deltaNew > (eps^2) * delta0
   const float eps  = SOLVER_ACCURACY;
+  //while ((i < _iterations) && (deltaNew > eps*delta0))
   float maxR = 2.0f * eps;
-  while ((i < _iterations) && (maxR > eps))
+  // while (i < _iterations)
+  while ((i < _iterations) && (maxR > 0.001*eps))
   {
-    // q = Ad
+
+	float alpha = 0.0f;
+
     index = _slabSize + _xRes + 1;
     for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
       for (y = 1; y < _yRes - 1; y++, index += 2)
@@ -292,233 +256,71 @@ void FLUID_3D::solvePressure(float* field, float* b, unsigned char* skip)
             if (!skip[index - _xRes]) Acenter += 1.;
             if (!skip[index + _slabSize]) Acenter += 1.;
             if (!skip[index - _slabSize]) Acenter += 1.;
-          }
-          
-          _q[index] = Acenter * _direction[index] +  
+
+			_q[index] = Acenter * _direction[index] +  
             _direction[index - 1] * (skip[index - 1] ? 0.0 : -1.0f) + 
             _direction[index + 1] * (skip[index + 1] ? 0.0 : -1.0f) +
             _direction[index - _xRes] * (skip[index - _xRes] ? 0.0 : -1.0f) + 
             _direction[index + _xRes] * (skip[index + _xRes] ? 0.0 : -1.0f)+
             _direction[index - _slabSize] * (skip[index - _slabSize] ? 0.0 : -1.0f) + 
             _direction[index + _slabSize] * (skip[index + _slabSize] ? 0.0 : -1.0f);
-          _q[index] = (skip[index]) ? 0.0f : _q[index];
-        }
-
-    // alpha = deltaNew / (transpose(d) * q)
-    float alpha = 0.0f;
-    index = _slabSize + _xRes + 1;
-    for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-      for (y = 1; y < _yRes - 1; y++, index += 2)
-        for (x = 1; x < _xRes - 1; x++, index++)
-          alpha += _direction[index] * _q[index];
-    if (fabs(alpha) > 0.0f)
-      alpha = deltaNew / alpha;
-
-    // x = x + alpha * d
-    index = _slabSize + _xRes + 1;
-    for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-      for (y = 1; y < _yRes - 1; y++, index += 2)
-        for (x = 1; x < _xRes - 1; x++, index++)
-          field[index] += alpha * _direction[index];
-
-    // r = r - alpha * q
-    maxR = 0.0f;
-    index = _slabSize + _xRes + 1;
-    for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-      for (y = 1; y < _yRes - 1; y++, index += 2)
-        for (x = 1; x < _xRes - 1; x++, index++)
-        {
-          _residual[index] -= alpha * _q[index];
-          maxR = (_residual[index] > maxR) ? _residual[index] : maxR;
-        }
-
-    // deltaOld = deltaNew
-    float deltaOld = deltaNew;
-
-    // deltaNew = transpose(r) * r
-    deltaNew = 0.0f;
-    index = _slabSize + _xRes + 1;
-    for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-      for (y = 1; y < _yRes - 1; y++, index += 2)
-        for (x = 1; x < _xRes - 1; x++, index++)
-          deltaNew += _residual[index] * _residual[index];
-
-    // beta = deltaNew / deltaOld
-    float beta = deltaNew / deltaOld;
-
-    // d = r + beta * d
-    index = _slabSize + _xRes + 1;
-    for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-      for (y = 1; y < _yRes - 1; y++, index += 2)
-        for (x = 1; x < _xRes - 1; x++, index++)
-          _direction[index] = _residual[index] + beta * _direction[index];
-
-    // i = i + 1
-    i++;
-  }
-  // cout << i << " iterations converged to " << maxR << endl;
-}
-#endif
-
-//////////////////////////////////////////////////////////////////////
-// solve the heat equation with CG
-//////////////////////////////////////////////////////////////////////
-void FLUID_3D::solveHeat(float* field, float* b, unsigned char* skip)
-{
-	int x, y, z;
-	size_t index;
-	const float heatConst = _dt * _heatDiffusion / (_dx * _dx);
-	float *_q, *_residual, *_direction;
-
-	// i = 0
-	int i = 0;
-
-	_residual     = new float[_totalCells]; // set 0
-	_direction    = new float[_totalCells]; // set 0
-	_q            = new float[_totalCells]; // set 0
-
-  	memset(_residual, 0, sizeof(float)*_xRes*_yRes*_zRes);
-	memset(_q, 0, sizeof(float)*_xRes*_yRes*_zRes);
-	memset(_direction, 0, sizeof(float)*_xRes*_yRes*_zRes);
+          }
+		  else
+		  {
+          _q[index] = 0.0f;
+		  }
 
-  // r = b - Ax
-  index = _slabSize + _xRes + 1;
-  for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-    for (y = 1; y < _yRes - 1; y++, index += 2)
-      for (x = 1; x < _xRes - 1; x++, index++)
-      {
-        // if the cell is a variable
-        float Acenter = 1.0f;
-        if (!skip[index])
-        {
-          // set the matrix to the Poisson stencil in order
-          if (!skip[index + 1]) Acenter += heatConst;
-          if (!skip[index - 1]) Acenter += heatConst;
-          if (!skip[index + _xRes]) Acenter += heatConst;
-          if (!skip[index - _xRes]) Acenter += heatConst;
-          if (!skip[index + _slabSize]) Acenter += heatConst;
-          if (!skip[index - _slabSize]) Acenter += heatConst;
+		  alpha += _direction[index] * _q[index];
         }
-        
-        _residual[index] = b[index] - (Acenter * field[index] + 
-          field[index - 1] * (skip[index - 1] ? 0.0 : -heatConst) + 
-          field[index + 1] * (skip[index + 1] ? 0.0 : -heatConst) +
-          field[index - _xRes] * (skip[index - _xRes] ? 0.0 : -heatConst) + 
-          field[index + _xRes] * (skip[index + _xRes] ? 0.0 : -heatConst) +
-          field[index - _slabSize] * (skip[index - _slabSize] ? 0.0 : -heatConst) + 
-          field[index + _slabSize] * (skip[index + _slabSize] ? 0.0 : -heatConst));
-        _residual[index] = (skip[index]) ? 0.0f : _residual[index];
-      }
 
-  // d = r
-  index = _slabSize + _xRes + 1;
-  for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-    for (y = 1; y < _yRes - 1; y++, index += 2)
-      for (x = 1; x < _xRes - 1; x++, index++)
-        _direction[index] = _residual[index];
 
-  // deltaNew = transpose(r) * r
-  float deltaNew = 0.0f;
-  index = _slabSize + _xRes + 1;
-  for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-    for (y = 1; y < _yRes - 1; y++, index += 2)
-      for (x = 1; x < _xRes - 1; x++, index++)
-        deltaNew += _residual[index] * _residual[index];
+    if (fabs(alpha) > 0.0f)
+      alpha = deltaNew / alpha;
 
-  // delta0 = deltaNew
-  // float delta0 = deltaNew;
+	float deltaOld = deltaNew;
+	deltaNew = 0.0f;
 
-  // While deltaNew > (eps^2) * delta0
-  const float eps  = SOLVER_ACCURACY;
-  float maxR = 2.0f * eps;
-  while ((i < _iterations) && (maxR > eps))
-  {
-    // q = Ad
-    index = _slabSize + _xRes + 1;
-    for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-      for (y = 1; y < _yRes - 1; y++, index += 2)
-        for (x = 1; x < _xRes - 1; x++, index++)
-        {
-          // if the cell is a variable
-          float Acenter = 1.0f;
-          if (!skip[index])
-          {
-            // set the matrix to the Poisson stencil in order
-            if (!skip[index + 1]) Acenter += heatConst;
-            if (!skip[index - 1]) Acenter += heatConst;
-            if (!skip[index + _xRes]) Acenter += heatConst;
-            if (!skip[index - _xRes]) Acenter += heatConst;
-            if (!skip[index + _slabSize]) Acenter += heatConst;
-            if (!skip[index - _slabSize]) Acenter += heatConst;
-          }
-
-          _q[index] = (Acenter * _direction[index] + 
-            _direction[index - 1] * (skip[index - 1] ? 0.0 : -heatConst) + 
-            _direction[index + 1] * (skip[index + 1] ? 0.0 : -heatConst) +
-            _direction[index - _xRes] * (skip[index - _xRes] ? 0.0 : -heatConst) + 
-            _direction[index + _xRes] * (skip[index + _xRes] ? 0.0 : -heatConst) +
-            _direction[index - _slabSize] * (skip[index - _slabSize] ? 0.0 : -heatConst) + 
-            _direction[index + _slabSize] * (skip[index + _slabSize] ? 0.0 : -heatConst));
-         
-          _q[index] = (skip[index]) ? 0.0f : _q[index];
-        }
+	maxR = 0.0;
 
-    // alpha = deltaNew / (transpose(d) * q)
-    float alpha = 0.0f;
-    index = _slabSize + _xRes + 1;
-    for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-      for (y = 1; y < _yRes - 1; y++, index += 2)
-        for (x = 1; x < _xRes - 1; x++, index++)
-          alpha += _direction[index] * _q[index];
-    if (fabs(alpha) > 0.0f)
-      alpha = deltaNew / alpha;
+	float tmp;
 
     // x = x + alpha * d
     index = _slabSize + _xRes + 1;
     for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
       for (y = 1; y < _yRes - 1; y++, index += 2)
         for (x = 1; x < _xRes - 1; x++, index++)
+		{
           field[index] += alpha * _direction[index];
 
-    // r = r - alpha * q
-    maxR = 0.0f;
-    index = _slabSize + _xRes + 1;
-    for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-      for (y = 1; y < _yRes - 1; y++, index += 2)
-        for (x = 1; x < _xRes - 1; x++, index++)
-        {
-          _residual[index] -= alpha * _q[index];
-          maxR = (_residual[index] > maxR) ? _residual[index] : maxR;
-        }
+		  _residual[index] -= alpha * _q[index];
 
-    // deltaOld = deltaNew
-    float deltaOld = deltaNew;
+		  _h[index] = _Precond[index] * _residual[index];
+
+		  tmp = _residual[index] * _h[index];
+		  deltaNew += tmp;
+		  maxR = (tmp > maxR) ? tmp : maxR;
+
+		}
 
-    // deltaNew = transpose(r) * r
-    deltaNew = 0.0f;
-    index = _slabSize + _xRes + 1;
-    for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
-      for (y = 1; y < _yRes - 1; y++, index += 2)
-        for (x = 1; x < _xRes - 1; x++, index++)
-          deltaNew += _residual[index] * _residual[index];
 
     // beta = deltaNew / deltaOld
     float beta = deltaNew / deltaOld;
 
-    // d = r + beta * d
+    // d = h + beta * d
     index = _slabSize + _xRes + 1;
     for (z = 1; z < _zRes - 1; z++, index += 2 * _xRes)
       for (y = 1; y < _yRes - 1; y++, index += 2)
         for (x = 1; x < _xRes - 1; x++, index++)
-          _direction[index] = _residual[index] + beta * _direction[index];
+          _direction[index] = _h[index] + beta * _direction[index];
 
     // i = i + 1
     i++;
   }
-  // cout << i << " iterations converged to " << maxR << endl;
+  // cout << i << " iterations converged to " << sqrt(maxR) << endl;
 
+	if (_h) delete[] _h;
+	if (_Precond) delete[] _Precond;
 	if (_residual) delete[] _residual;
 	if (_direction) delete[] _direction;
 	if (_q)       delete[] _q;
 }
-
diff --git a/intern/smoke/intern/FLUID_3D_STATIC.cpp b/intern/smoke/intern/FLUID_3D_STATIC.cpp
index afeca2b1faa..89ac2c74e15 100644
--- a/intern/smoke/intern/FLUID_3D_STATIC.cpp
+++ b/intern/smoke/intern/FLUID_3D_STATIC.cpp
@@ -19,6 +19,11 @@
 // FLUID_3D.cpp: implementation of the static functions of the FLUID_3D class.
 //
 //////////////////////////////////////////////////////////////////////
+// Heavy parallel optimization done. Many of the old functions now
+// take begin and end parameters and process only specified part of the data.
+// Some functions were divided into multiple ones.
+//		- MiikaH
+//////////////////////////////////////////////////////////////////////
 
 #include <zlib.h>
 #include "FLUID_3D.h"
@@ -75,11 +80,11 @@ void FLUID_3D::addSmokeTestCase(float* field, Vec3Int res)
 //////////////////////////////////////////////////////////////////////
 // set x direction to Neumann boundary conditions
 //////////////////////////////////////////////////////////////////////
-void FLUID_3D::setNeumannX(float* field, Vec3Int res)
+void FLUID_3D::setNeumannX(float* field, Vec3Int res, int zBegin, int zEnd)
 {
 	const int slabSize = res[0] * res[1];
 	int index;
-	for (int z = 0; z < res[2]; z++)
+	for (int z = zBegin; z < zEnd; z++)
 		for (int y = 0; y < res[1]; y++)
 		{
 			// left slab
@@ -93,7 +98,7 @@ void FLUID_3D::setNeumannX(float* field, Vec3Int res)
 
 	// fix, force top slab to only allow outwards flux
 	for (int y = 0; y < res[1]; y++)
-		for (int z = 0; z < res[2]; z++)
+		for (int z = zBegin; z < zEnd; z++)
 		{
 			// top slab
 			index = y * res[0] + z * slabSize;
@@ -107,11 +112,11 @@ void FLUID_3D::setNeumannX(float* field, Vec3Int res)
 //////////////////////////////////////////////////////////////////////
 // set y direction to Neumann boundary conditions
 //////////////////////////////////////////////////////////////////////
-void FLUID_3D::setNeumannY(float* field, Vec3Int res)
+void FLUID_3D::setNeumannY(float* field, Vec3Int res, int zBegin, int zEnd)
 {
 	const int slabSize = res[0] * res[1];
 	int index;
-	for (int z = 0; z < res[2]; z++)
+	for (int z = zBegin; z < zEnd; z++)
 		for (int x = 0; x < res[0]; x++)
 		{
 			// bottom slab
@@ -124,7 +129,7 @@ void FLUID_3D::setNeumannY(float* field, Vec3Int res)
 		}
 
 	// fix, force top slab to only allow outwards flux
-	for (int z = 0; z < res[2]; z++)
+	for (int z = zBegin; z < zEnd; z++)
 		for (int x = 0; x < res[0]; x++)
 		{
 			// top slab
@@ -140,22 +145,36 @@ void FLUID_3D::setNeumannY(float* field, Vec3Int res)
 //////////////////////////////////////////////////////////////////////
 // set z direction to Neumann boundary conditions
 //////////////////////////////////////////////////////////////////////
-void FLUID_3D::setNeumannZ(float* field, Vec3Int res)
+void FLUID_3D::setNeumannZ(float* field, Vec3Int res, int zBegin, int zEnd)
 {
 	const int slabSize = res[0] * res[1];
 	const int totalCells = res[0] * res[1] * res[2];
+	const int cellsslab = totalCells - slabSize;
 	int index;
+
+	index = 0;
+	if (zBegin == 0)
 	for (int y = 0; y < res[1]; y++)
-		for (int x = 0; x < res[0]; x++)
+		for (int x = 0; x < res[0]; x++, index++)
 		{
 			// front slab
-			index = x + y * res[0];
 			field[index] = field[index + 2 * slabSize];
+		}
+
+	if (zEnd == res[2])
+	{
+	index = 0;
+	int indexx = 0;
+
+	for (int y = 0; y < res[1]; y++)
+		for (int x = 0; x < res[0]; x++, index++)
+		{
 
 			// back slab
-			index += totalCells - slabSize;
-			field[index] = field[index - 2 * slabSize];
+			indexx = index + cellsslab;
+			field[indexx] = field[indexx - 2 * slabSize];
 		}
+	
 
 	// fix, force top slab to only allow outwards flux
 	for (int y = 0; y < res[1]; y++)
@@ -163,22 +182,24 @@ void FLUID_3D::setNeumannZ(float* field, Vec3Int res)
 		{
 			// top slab
 			index = x + y * res[0];
-			index += totalCells - slabSize;
+			index += cellsslab;
 			if(field[index]<0.) field[index] = 0.;
 			index -= slabSize;
 			if(field[index]<0.) field[index] = 0.;
 		}
+
+	}	// zEnd == res[2]
 		
 }
 
 //////////////////////////////////////////////////////////////////////
 // set x direction to zero
 //////////////////////////////////////////////////////////////////////
-void FLUID_3D::setZeroX(float* field, Vec3Int res)
+void FLUID_3D::setZeroX(float* field, Vec3Int res, int zBegin, int zEnd)
 {
 	const int slabSize = res[0] * res[1];
 	int index;
-	for (int z = 0; z < res[2]; z++)
+	for (int z = zBegin; z < zEnd; z++)
 		for (int y = 0; y < res[1]; y++)
 		{
 			// left slab
@@ -194,11 +215,11 @@ void FLUID_3D::setZeroX(float* field, Vec3Int res)
 //////////////////////////////////////////////////////////////////////
 // set y direction to zero
 //////////////////////////////////////////////////////////////////////
-void FLUID_3D::setZeroY(float* field, Vec3Int res)
+void FLUID_3D::setZeroY(float* field, Vec3Int res, int zBegin, int zEnd)
 {
 	const int slabSize = res[0] * res[1];
 	int index;
-	for (int z = 0; z < res[2]; z++)
+	for (int z = zBegin; z < zEnd; z++)
 		for (int x = 0; x < res[0]; x++)
 		{
 			// bottom slab
@@ -214,32 +235,44 @@ void FLUID_3D::setZeroY(float* field, Vec3Int res)
 //////////////////////////////////////////////////////////////////////
 // set z direction to zero
 //////////////////////////////////////////////////////////////////////
-void FLUID_3D::setZeroZ(float* field, Vec3Int res)
+void FLUID_3D::setZeroZ(float* field, Vec3Int res, int zBegin, int zEnd)
 {
 	const int slabSize = res[0] * res[1];
 	const int totalCells = res[0] * res[1] * res[2];
-	int index;
+
+	int index = 0;
+	if ((zBegin == 0))
 	for (int y = 0; y < res[1]; y++)
-		for (int x = 0; x < res[0]; x++)
+		for (int x = 0; x < res[0]; x++, index++)
 		{
 			// front slab
-			index = x + y * res[0];
 			field[index] = 0.0f;
+    }
 
-			// back slab
-			index += totalCells - slabSize;
-			field[index] = 0.0f;
-		}
- }
+	if (zEnd == res[2])
+	{
+		index=0;
+		int indexx=0;
+		const int cellsslab = totalCells - slabSize;
+
+		for (int y = 0; y < res[1]; y++)
+			for (int x = 0; x < res[0]; x++, index++)
+			{
 
+				// back slab
+				indexx = index + cellsslab;
+				field[indexx] = 0.0f;
+			}
+	}
+ }
 //////////////////////////////////////////////////////////////////////
 // copy grid boundary
 //////////////////////////////////////////////////////////////////////
-void FLUID_3D::copyBorderX(float* field, Vec3Int res)
+void FLUID_3D::copyBorderX(float* field, Vec3Int res, int zBegin, int zEnd)
 {
 	const int slabSize = res[0] * res[1];
 	int index;
-	for (int z = 0; z < res[2]; z++)
+	for (int z = zBegin; z < zEnd; z++)
 		for (int y = 0; y < res[1]; y++)
 		{
 			// left slab
@@ -251,12 +284,12 @@ void FLUID_3D::copyBorderX(float* field, Vec3Int res)
 			field[index] = field[index - 1];
 		}
 }
-void FLUID_3D::copyBorderY(float* field, Vec3Int res)
+void FLUID_3D::copyBorderY(float* field, Vec3Int res, int zBegin, int zEnd)
 {
 	const int slabSize = res[0] * res[1];
-	const int totalCells = res[0] * res[1] * res[2];
+	//const int totalCells = res[0] * res[1] * res[2];
 	int index;
-	for (int z = 0; z < res[2]; z++)
+	for (int z = zBegin; z < zEnd; z++)
 		for (int x = 0; x < res[0]; x++)
 		{
 			// bottom slab
@@ -267,40 +300,49 @@ void FLUID_3D::copyBorderY(float* field, Vec3Int res)
 			field[index] = field[index - res[0]];
 		}
 }
-void FLUID_3D::copyBorderZ(float* field, Vec3Int res)
+void FLUID_3D::copyBorderZ(float* field, Vec3Int res, int zBegin, int zEnd)
 {
 	const int slabSize = res[0] * res[1];
 	const int totalCells = res[0] * res[1] * res[2];
-	int index;
+	int index=0;
+
+	if ((zBegin == 0))
 	for (int y = 0; y < res[1]; y++)
-		for (int x = 0; x < res[0]; x++)
+		for (int x = 0; x < res[0]; x++, index++)
 		{
-			// front slab
-			index = x + y * res[0];
 			field[index] = field[index + slabSize]; 
+		}
+
+	if ((zEnd == res[2]))
+	{
+
+	index=0;
+	int indexx=0;
+	const int cellsslab = totalCells - slabSize;
+
+	for (int y = 0; y < res[1]; y++)
+		for (int x = 0; x < res[0]; x++, index++)
+		{
 			// back slab
-			index += totalCells - slabSize;
-			field[index] = field[index - slabSize];
+			indexx = index + cellsslab;
+			field[indexx] = field[indexx - slabSize];
 		}
+	}
 }
 
 /////////////////////////////////////////////////////////////////////
 // advect field with the semi lagrangian method
 //////////////////////////////////////////////////////////////////////
 void FLUID_3D::advectFieldSemiLagrange(const float dt, const float* velx, const float* vely,  const float* velz,
-		float* oldField, float* newField, Vec3Int res)
+		float* oldField, float* newField, Vec3Int res, int zBegin, int zEnd)
 {
 	const int xres = res[0];
 	const int yres = res[1];
 	const int zres = res[2];
 	const int slabSize = res[0] * res[1];
 
-	// scale dt up to grid resolution
-#if PARALLEL==1 && !_WIN32
-#pragma omp parallel
-#pragma omp for  schedule(static)
-#endif
-	for (int z = 0; z < zres; z++)
+
+	for (int z = zBegin; z < zEnd; z++)
 		for (int y = 0; y < yres; y++)
 			for (int x = 0; x < xres; x++)
 			{
@@ -357,50 +399,69 @@ void FLUID_3D::advectFieldSemiLagrange(const float dt, const float* velx, const
 			}
 }
 
+
 /////////////////////////////////////////////////////////////////////
 // advect field with the maccormack method
 //
 // comments are the pseudocode from selle's paper
 //////////////////////////////////////////////////////////////////////
-void FLUID_3D::advectFieldMacCormack(const float dt, const float* xVelocity, const float* yVelocity, const float* zVelocity, 
-				float* oldField, float* newField, float* temp1, float* temp2, Vec3Int res, const unsigned char* obstacles)
+void FLUID_3D::advectFieldMacCormack1(const float dt, const float* xVelocity, const float* yVelocity, const float* zVelocity, 
+				float* oldField, float* tempResult, Vec3Int res, int zBegin, int zEnd)
 {
-	float* phiHatN  = temp1;
-	float* phiHatN1 = temp2;
 	const int sx= res[0];
 	const int sy= res[1];
 	const int sz= res[2];
 
-	for (int x = 0; x < sx * sy * sz; x++)
-		phiHatN[x] = phiHatN1[x] = oldField[x];
+	/*for (int x = 0; x < sx * sy * sz; x++)
+		phiHatN[x] = phiHatN1[x] = oldField[x];*/	// not needed as all the values are written first
 
 	float*& phiN    = oldField;
-	float*& phiN1   = newField;
+	float*& phiN1   = tempResult;
+
+
 
 	// phiHatN1 = A(phiN)
-	advectFieldSemiLagrange(  dt, xVelocity, yVelocity, zVelocity, phiN, phiHatN1, res);
+	advectFieldSemiLagrange(  dt, xVelocity, yVelocity, zVelocity, phiN, phiN1, res, zBegin, zEnd);		// uses wide data from old field and velocities (both are whole)
+}
+
+
+
+void FLUID_3D::advectFieldMacCormack2(const float dt, const float* xVelocity, const float* yVelocity, const float* zVelocity, 
+				float* oldField, float* newField, float* tempResult, float* temp1, Vec3Int res, const unsigned char* obstacles, int zBegin, int zEnd)
+{
+	float* phiHatN  = tempResult;
+	float* t1  = temp1;
+	const int sx= res[0];
+	const int sy= res[1];
+	const int sz= res[2];
+
+	float*& phiN    = oldField;
+	float*& phiN1   = newField;
+
+
 
 	// phiHatN = A^R(phiHatN1)
-	advectFieldSemiLagrange( -1.0*dt, xVelocity, yVelocity, zVelocity, phiHatN1, phiHatN, res);
+	advectFieldSemiLagrange( -1.0*dt, xVelocity, yVelocity, zVelocity, phiHatN, t1, res, zBegin, zEnd);		// uses wide data from old field and velocities (both are whole)
 
 	// phiN1 = phiHatN1 + (phiN - phiHatN) / 2
 	const int border = 0; 
-	for (int z = border; z < sz-border; z++)
+	for (int z = zBegin+border; z < zEnd-border; z++)
 		for (int y = border; y < sy-border; y++)
 			for (int x = border; x < sx-border; x++) {
 				int index = x + y * sx + z * sx*sy;
-				phiN1[index] = phiHatN1[index] + (phiN[index] - phiHatN[index]) * 0.50f;
+				phiN1[index] = phiHatN[index] + (phiN[index] - t1[index]) * 0.50f;
 				//phiN1[index] = phiHatN1[index]; // debug, correction off
 			}
-	copyBorderX(phiN1, res);
-	copyBorderY(phiN1, res);
-	copyBorderZ(phiN1, res);
+	copyBorderX(phiN1, res, zBegin, zEnd);
+	copyBorderY(phiN1, res, zBegin, zEnd);
+	copyBorderZ(phiN1, res, zBegin, zEnd);
 
 	// clamp any newly created extrema
-	clampExtrema(dt, xVelocity, yVelocity, zVelocity, oldField, newField, res);
+	clampExtrema(dt, xVelocity, yVelocity, zVelocity, oldField, newField, res, zBegin, zEnd);		// uses wide data from old field and velocities (both are whole)
 
 	// if the error estimate was bad, revert to first order
-	clampOutsideRays(dt, xVelocity, yVelocity, zVelocity, oldField, newField, res, obstacles, phiHatN1);
+	clampOutsideRays(dt, xVelocity, yVelocity, zVelocity, oldField, newField, res, obstacles, phiHatN, zBegin, zEnd);	// phiHatN is only used at cells within thread range, so its ok
+
 } 
 
 
@@ -408,13 +469,21 @@ void FLUID_3D::advectFieldMacCormack(const float dt, const float* xVelocity, con
 // Clamp the extrema generated by the BFECC error correction
 //////////////////////////////////////////////////////////////////////
 void FLUID_3D::clampExtrema(const float dt, const float* velx, const float* vely,  const float* velz,
-		float* oldField, float* newField, Vec3Int res)
+		float* oldField, float* newField, Vec3Int res, int zBegin, int zEnd)
 {
 	const int xres= res[0];
 	const int yres= res[1];
 	const int zres= res[2];
 	const int slabSize = res[0] * res[1];
-	for (int z = 1; z < zres-1; z++)
+
+	int bb=0;
+	int bt=0;
+
+	if (zBegin == 0) {bb = 1;}
+	if (zEnd == res[2]) {bt = 1;}
+
+
+	for (int z = zBegin+bb; z < zEnd-bt; z++)
 		for (int y = 1; y < yres-1; y++)
 			for (int x = 1; x < xres-1; x++)
 			{
@@ -484,14 +553,20 @@ void FLUID_3D::clampExtrema(const float dt, const float* velx, const float* vely
 // incorrect
 //////////////////////////////////////////////////////////////////////
 void FLUID_3D::clampOutsideRays(const float dt, const float* velx, const float* vely,  const float* velz,
-				float* oldField, float* newField, Vec3Int res, const unsigned char* obstacles, const float *oldAdvection)
+				float* oldField, float* newField, Vec3Int res, const unsigned char* obstacles, const float *oldAdvection, int zBegin, int zEnd)
 {
 	const int sx= res[0];
 	const int sy= res[1];
 	const int sz= res[2];
 	const int slabSize = res[0] * res[1];
 
-	for (int z = 1; z < sz-1; z++)
+	int bb=0;
+	int bt=0;
+
+	if (zBegin == 0) {bb = 1;}
+	if (zEnd == res[2]) {bt = 1;}
+
+	for (int z = zBegin+bb; z < zEnd-bt; z++)
 		for (int y = 1; y < sy-1; y++)
 			for (int x = 1; x < sx-1; x++)
 			{
@@ -607,4 +682,3 @@ void FLUID_3D::clampOutsideRays(const float dt, const float* velx, const float*
 				}
 			} // xyz
 }
-
diff --git a/intern/smoke/intern/LU_HELPER.cpp b/intern/smoke/intern/LU_HELPER.cpp
new file mode 100644
index 00000000000..43357e6be03
--- /dev/null
+++ b/intern/smoke/intern/LU_HELPER.cpp
@@ -0,0 +1,136 @@
+
+#include "LU_HELPER.h"
+
+int isNonsingular (sLU LU_) {
+      for (int j = 0; j < 3; j++) {
+         if (LU_.values[j][j] == 0)
+            return 0;
+      }
+      return 1;
+}
+
+sLU computeLU( float a[3][3])
+{
+	sLU result;
+	int m=3;
+	int n=3;
+
+	//float LU_[3][3];
+	for (int i = 0; i < m; i++) {
+			result.piv[i] = i;
+			for (int j = 0; j < n; j++) result.values[i][j]=a[i][j];
+      }
+
+      result.pivsign = 1;
+      //Real *LUrowi = 0;;
+      //Array1D<Real> LUcolj(m);
+	  //float *LUrowi = 0;
+	  float LUcolj[3];
+
+      // Outer loop.
+
+      for (int j = 0; j < n; j++) {
+
+         // Make a copy of the j-th column to localize references.
+
+         for (int i = 0; i < m; i++) {
+            LUcolj[i] = result.values[i][j];
+         }
+
+         // Apply previous transformations.
+
+         for (int i = 0; i < m; i++) {
+			 //float LUrowi[3];
+			 //LUrowi = result.values[i];
+
+            // Most of the time is spent in the following dot product.
+
+            int kmax = min(i,j);
+            double s = 0.0;
+            for (int k = 0; k < kmax; k++) {
+               s += result.values[i][k]*LUcolj[k];
+            }
+
+            result.values[i][j] = LUcolj[i] -= s;
+         }
+   
+         // Find pivot and exchange if necessary.
+
+         int p = j;
+         for (int i = j+1; i < m; i++) {
+            if (abs(LUcolj[i]) > abs(LUcolj[p])) {
+               p = i;
+            }
+         }
+         if (p != j) {
+		    int k=0;
+            for (k = 0; k < n; k++) {
+               double t = result.values[p][k]; 
+			   result.values[p][k] = result.values[j][k]; 
+			   result.values[j][k] = t;
+            }
+            k = result.piv[p]; 
+			result.piv[p] = result.piv[j]; 
+			result.piv[j] = k;
+            result.pivsign = -result.pivsign;
+         }
+
+         // Compute multipliers.
+         
+         if ((j < m) && (result.values[j][j] != 0.0)) {
+            for (int i = j+1; i < m; i++) {
+               result.values[i][j] /= result.values[j][j];
+            }
+         }
+      }
+
+	  return result;
+}
+
+void solveLU3x3(sLU& A, float x[3], float b[3])
+{
+  //TNT::Array1D<float> jamaB = TNT::Array1D<float>(3, &b[0]);
+  //TNT::Array1D<float> jamaX = A.solve(jamaB);
+
+	
+  // Solve A, B
+
+	{
+      if (!isNonsingular(A)) {
+        x[0]=0.0f;
+		x[1]=0.0f;
+		x[2]=0.0f;
+		return;
+      }
+
+
+	  //Array1D<Real> Ax = permute_copy(b, piv);
+	  float Ax[3];
+
+    // permute copy: b , A.piv
+	{
+         for (int i = 0; i < 3; i++) 
+               Ax[i] = b[A.piv[i]];
+	}
+
+      // Solve L*Y = B(piv)
+      for (int k = 0; k < 3; k++) {
+         for (int i = k+1; i < 3; i++) {
+               Ax[i] -= Ax[k]*A.values[i][k];
+            }
+         }
+      
+	  // Solve U*X = Y;
+      for (int k = 2; k >= 0; k--) {
+            Ax[k] /= A.values[k][k];
+      		for (int i = 0; i < k; i++) 
+            	Ax[i] -= Ax[k]*A.values[i][k];
+      }
+     
+
+		x[0] = Ax[0];
+		x[1] = Ax[1];
+		x[2] = Ax[2];
+      return;
+	}
+}
diff --git a/intern/smoke/intern/LU_HELPER.h b/intern/smoke/intern/LU_HELPER.h
index b3f3c5a1cb4..e79b4ffa01b 100644
--- a/intern/smoke/intern/LU_HELPER.h
+++ b/intern/smoke/intern/LU_HELPER.h
@@ -17,40 +17,35 @@
 // Copyright 2008 Theodore Kim and Nils Thuerey
 // 
 //////////////////////////////////////////////////////////////////////
+// Modified to not require TNT matrix library anymore. It was very slow
+// when being run in parallel. Required TNT JAMA:LU libraries were
+// converted into independent functions.
+//		- MiikaH
+//////////////////////////////////////////////////////////////////////
 
 #ifndef LU_HELPER_H
 #define LU_HELPER_H
 
-//////////////////////////////////////////////////////////////////////
-// Helper function, compute eigenvalues of 3x3 matrix
-//////////////////////////////////////////////////////////////////////
+#include <cmath>
+#include <algorithm>
 
-#include "tnt/jama_lu.h"
+using namespace std;
 
 //////////////////////////////////////////////////////////////////////
-// LU decomposition of 3x3 non-symmetric matrix
+// Helper function, compute eigenvalues of 3x3 matrix
 //////////////////////////////////////////////////////////////////////
-JAMA::LU<float> inline computeLU3x3(
-		float a[3][3])
-{
-		TNT::Array2D<float> A = TNT::Array2D<float>(3,3, &a[0][0]);
-		JAMA::LU<float> jLU= JAMA::LU<float>(A);
-		return jLU;
-}
 
-//////////////////////////////////////////////////////////////////////
-// LU decomposition of 3x3 non-symmetric matrix
-//////////////////////////////////////////////////////////////////////
-void inline solveLU3x3(
-    JAMA::LU<float>& A,
-    float x[3],
-    float b[3])
+struct sLU
 {
-  TNT::Array1D<float> jamaB = TNT::Array1D<float>(3, &b[0]);
-  TNT::Array1D<float> jamaX = A.solve(jamaB);
+	float values[3][3];
+	int pivsign;
+	int piv[3];
+};
+
+
+int isNonsingular (sLU LU_);
+sLU computeLU( float a[3][3]);
+void solveLU3x3(sLU& A, float x[3], float b[3]);
+
 
-  x[0] = jamaX[0];
-  x[1] = jamaX[1];
-  x[2] = jamaX[2];
-}
 #endif
diff --git a/intern/smoke/intern/WTURBULENCE.cpp b/intern/smoke/intern/WTURBULENCE.cpp
index 7ea4bde3884..6d43bc95471 100644
--- a/intern/smoke/intern/WTURBULENCE.cpp
+++ b/intern/smoke/intern/WTURBULENCE.cpp
@@ -18,6 +18,10 @@
 // 
 // WTURBULENCE handling
 ///////////////////////////////////////////////////////////////////////////////////
+// Parallelized turbulence even further. TNT matrix library functions
+// rewritten to improve performance.
+//		- MiikaH
+//////////////////////////////////////////////////////////////////////
 
 #include "WTURBULENCE.h"
 #include "INTERPOLATE.h"
@@ -29,6 +33,7 @@
 #include "LU_HELPER.h"
 #include "SPHERE.h"
 #include <zlib.h>
+#include <math.h>
 
 // needed to access static advection functions
 #include "FLUID_3D.h"
@@ -278,27 +283,34 @@ static float minDz(int x, int y, int z, float* input, Vec3Int res)
 // Beware -- uses big density maccormack as temporary arrays
 ////////////////////////////////////////////////////////////////////// 
 void WTURBULENCE::advectTextureCoordinates (float dtOrg, float* xvel, float* yvel, float* zvel, float *tempBig1, float *tempBig2) {
+
   // advection
   SWAP_POINTERS(_tcTemp, _tcU);
-  FLUID_3D::copyBorderX(_tcTemp, _resSm);
-  FLUID_3D::copyBorderY(_tcTemp, _resSm);
-  FLUID_3D::copyBorderZ(_tcTemp, _resSm);
-  FLUID_3D::advectFieldMacCormack(dtOrg, xvel, yvel, zvel, 
-      _tcTemp, _tcU, tempBig1, tempBig2, _resSm, NULL);
+  FLUID_3D::copyBorderX(_tcTemp, _resSm, 0 , _resSm[2]);
+  FLUID_3D::copyBorderY(_tcTemp, _resSm, 0 , _resSm[2]);
+  FLUID_3D::copyBorderZ(_tcTemp, _resSm, 0 , _resSm[2]);
+  FLUID_3D::advectFieldMacCormack1(dtOrg, xvel, yvel, zvel, 
+      _tcTemp, tempBig1, _resSm, 0 , _resSm[2]);
+  FLUID_3D::advectFieldMacCormack2(dtOrg, xvel, yvel, zvel, 
+      _tcTemp, _tcU, tempBig1, tempBig2, _resSm, NULL, 0 , _resSm[2]);
 
   SWAP_POINTERS(_tcTemp, _tcV);
-  FLUID_3D::copyBorderX(_tcTemp, _resSm);
-  FLUID_3D::copyBorderY(_tcTemp, _resSm);
-  FLUID_3D::copyBorderZ(_tcTemp, _resSm);
-  FLUID_3D::advectFieldMacCormack(dtOrg, xvel, yvel, zvel, 
-      _tcTemp, _tcV, tempBig1, tempBig2, _resSm, NULL);
+  FLUID_3D::copyBorderX(_tcTemp, _resSm, 0 , _resSm[2]);
+  FLUID_3D::copyBorderY(_tcTemp, _resSm, 0 , _resSm[2]);
+  FLUID_3D::copyBorderZ(_tcTemp, _resSm, 0 , _resSm[2]);
+  FLUID_3D::advectFieldMacCormack1(dtOrg, xvel, yvel, zvel, 
+      _tcTemp, tempBig1, _resSm, 0 , _resSm[2]);
+  FLUID_3D::advectFieldMacCormack2(dtOrg, xvel, yvel, zvel, 
+      _tcTemp, _tcV, tempBig1, tempBig2, _resSm, NULL, 0 , _resSm[2]);
 
   SWAP_POINTERS(_tcTemp, _tcW);
-  FLUID_3D::copyBorderX(_tcTemp, _resSm);
-  FLUID_3D::copyBorderY(_tcTemp, _resSm);
-  FLUID_3D::copyBorderZ(_tcTemp, _resSm);
-  FLUID_3D::advectFieldMacCormack(dtOrg, xvel, yvel, zvel, 
-      _tcTemp, _tcW, tempBig1, tempBig2, _resSm, NULL);
+  FLUID_3D::copyBorderX(_tcTemp, _resSm, 0 , _resSm[2]);
+  FLUID_3D::copyBorderY(_tcTemp, _resSm, 0 , _resSm[2]);
+  FLUID_3D::copyBorderZ(_tcTemp, _resSm, 0 , _resSm[2]);
+  FLUID_3D::advectFieldMacCormack1(dtOrg, xvel, yvel, zvel, 
+      _tcTemp, tempBig1, _resSm, 0 , _resSm[2]);
+  FLUID_3D::advectFieldMacCormack2(dtOrg, xvel, yvel, zvel, 
+      _tcTemp, _tcW, tempBig1, tempBig2, _resSm, NULL, 0 , _resSm[2]);
 }
 
 //////////////////////////////////////////////////////////////////////
@@ -325,9 +337,9 @@ void WTURBULENCE::computeEigenvalues(float *_eigMin, float *_eigMax) {
 
         // ONLY compute the eigenvalues after checking that the matrix
         // is nonsingular
-        JAMA::LU<float> LU = computeLU3x3(jacobian);
+        sLU LU = computeLU(jacobian);
 
-        if (LU.isNonsingular())
+        if (isNonsingular(LU))
         {
           // get the analytic eigenvalues, quite slow right now...
           Vec3 eigenvalues = Vec3(1.);
@@ -422,9 +434,9 @@ void WTURBULENCE::computeEnergy(float *_energy, float* xvel, float* yvel, float*
   for (int x = 0; x < _totalCellsSm; x++) 
     _energy[x] = 0.5f * (xvel[x] * xvel[x] + yvel[x] * yvel[x] + zvel[x] * zvel[x]);
 
-  FLUID_3D::copyBorderX(_energy, _resSm);
-  FLUID_3D::copyBorderY(_energy, _resSm);
-  FLUID_3D::copyBorderZ(_energy, _resSm);
+  FLUID_3D::copyBorderX(_energy, _resSm, 0 , _resSm[2]);
+  FLUID_3D::copyBorderY(_energy, _resSm, 0 , _resSm[2]);
+  FLUID_3D::copyBorderZ(_energy, _resSm, 0 , _resSm[2]);
 
   // pseudo-march the values into the obstacles
   // the wavelet upsampler only uses a 3x3 support neighborhood, so
@@ -563,7 +575,7 @@ Vec3 WTURBULENCE::WVelocityWithJacobian(Vec3 orgPos, float* xUnwarped, float* yU
 //////////////////////////////////////////////////////////////////////
 // perform an actual noise advection step
 //////////////////////////////////////////////////////////////////////
-void WTURBULENCE::stepTurbulenceReadable(float dtOrg, float* xvel, float* yvel, float* zvel, unsigned char *obstacles) 
+/*void WTURBULENCE::stepTurbulenceReadable(float dtOrg, float* xvel, float* yvel, float* zvel, unsigned char *obstacles) 
 {
 	// enlarge timestep to match grid
 	const float dt = dtOrg * _amplify;
@@ -689,9 +701,9 @@ void WTURBULENCE::stepTurbulenceReadable(float dtOrg, float* xvel, float* yvel,
   const float dtSubdiv = dt / (float)totalSubsteps;
 
   // set boundaries of big velocity grid
-  FLUID_3D::setZeroX(bigUx, _resBig); 
-  FLUID_3D::setZeroY(bigUy, _resBig); 
-  FLUID_3D::setZeroZ(bigUz, _resBig);
+  FLUID_3D::setZeroX(bigUx, _resBig, 0, _resBig[2]); 
+  FLUID_3D::setZeroY(bigUy, _resBig, 0, _resBig[2]); 
+  FLUID_3D::setZeroZ(bigUz, _resBig, 0, _resBig[2]);
 
   // do the MacCormack advection, with substepping if necessary
   for(int substep = 0; substep < totalSubsteps; substep++)
@@ -704,7 +716,7 @@ void WTURBULENCE::stepTurbulenceReadable(float dtOrg, float* xvel, float* yvel,
   } // substep
   
   // wipe the density borders
-  FLUID_3D::setZeroBorder(_densityBig, _resBig);
+  FLUID_3D::setZeroBorder(_densityBig, _resBig, 0, _resBig[2]);
     
   // reset texture coordinates now in preparation for next timestep
   // Shouldn't do this before generating the noise because then the 
@@ -724,7 +736,9 @@ void WTURBULENCE::stepTurbulenceReadable(float dtOrg, float* xvel, float* yvel,
   
 
   _totalStepsBig++;
-}
+}*/
+
+//struct
 
 //////////////////////////////////////////////////////////////////////
 // perform the full turbulence algorithm, including OpenMP 
@@ -747,6 +761,7 @@ void WTURBULENCE::stepTurbulenceFull(float dtOrg, float* xvel, float* yvel, floa
 
 	memset(_tcTemp, 0, sizeof(float)*_totalCellsSm);
 
+
 	// prepare textures
 	advectTextureCoordinates(dtOrg, xvel,yvel,zvel, tempBig1, tempBig2);
 
@@ -763,25 +778,38 @@ void WTURBULENCE::stepTurbulenceFull(float dtOrg, float* xvel, float* yvel, floa
 		if (obstacles[x]) highFreqEnergy[x] = 0.f;
 
 	Vec3Int ressm(_xResSm, _yResSm, _zResSm);
-	FLUID_3D::setNeumannX(highFreqEnergy, ressm);
-	FLUID_3D::setNeumannY(highFreqEnergy, ressm);
-	FLUID_3D::setNeumannZ(highFreqEnergy, ressm);
+	FLUID_3D::setNeumannX(highFreqEnergy, ressm, 0 , ressm[2]);
+	FLUID_3D::setNeumannY(highFreqEnergy, ressm, 0 , ressm[2]);
+	FLUID_3D::setNeumannZ(highFreqEnergy, ressm, 0 , ressm[2]);
+
+
+   int threadval = 1;
+#if PARALLEL==1
+  threadval = omp_get_max_threads();
+#endif
+
 
   // parallel region setup
-  float maxVelMagThreads[8] = { -1., -1., -1., -1., -1., -1., -1., -1. };
-#if PARALLEL==1 && !_WIN32
+  // Uses omp_get_max_trheads to get number of required cells.
+  float* maxVelMagThreads = new float[threadval];
+
+  for (int i=0; i<threadval; i++) maxVelMagThreads[i] = -1.0f;
+
+#if PARALLEL==1
+
 #pragma omp parallel
 #endif
   { float maxVelMag1 = 0.;
-#if PARALLEL==1 && !_WIN32
+#if PARALLEL==1
     const int id  = omp_get_thread_num(); /*, num = omp_get_num_threads(); */
 #endif
 
   // vector noise main loop
-#if PARALLEL==1 && !_WIN32
-#pragma omp for  schedule(static)
+#if PARALLEL==1
+#pragma omp for schedule(static,1)
 #endif
-  for (int zSmall = 0; zSmall < _zResSm; zSmall++) 
+  for (int zSmall = 0; zSmall < _zResSm; zSmall++)
+  {
   for (int ySmall = 0; ySmall < _yResSm; ySmall++) 
   for (int xSmall = 0; xSmall < _xResSm; xSmall++)
   {
@@ -796,14 +824,14 @@ void WTURBULENCE::stepTurbulenceFull(float dtOrg, float* xvel, float* yvel, floa
 
     // get LU factorization of texture jacobian and apply 
     // it to unit vectors
-    JAMA::LU<float> LU = computeLU3x3(jacobian);
+    sLU LU = computeLU(jacobian);
     float xUnwarped[] = {1.0f, 0.0f, 0.0f};
     float yUnwarped[] = {0.0f, 1.0f, 0.0f};
     float zUnwarped[] = {0.0f, 0.0f, 1.0f};
     float xWarped[] = {1.0f, 0.0f, 0.0f};
     float yWarped[] = {0.0f, 1.0f, 0.0f};
     float zWarped[] = {0.0f, 0.0f, 1.0f};
-    bool nonSingular = LU.isNonsingular();
+    bool nonSingular = isNonsingular(LU);
 #if 0
 	// UNUSED
     float eigMax = 10.0f;
@@ -908,24 +936,27 @@ void WTURBULENCE::stepTurbulenceFull(float dtOrg, float* xvel, float* yvel, floa
           obstacles, posSm[0], posSm[1], posSm[2], _xResSm, _yResSm, _zResSm); 
       if (obsCheck > 0.95) 
         bigUx[index] = bigUy[index] = bigUz[index] = 0.;
-    } // xyz
+    } // xyz*/
 
-#if PARALLEL==1 && !_WIN32
+#if PARALLEL==1
     maxVelMagThreads[id] = maxVelMag1;
 #else
     maxVelMagThreads[0] = maxVelMag1;
 #endif
+  }
   }
   } // omp
   
   // compute maximum over threads
   float maxVelMag = maxVelMagThreads[0];
-#if PARALLEL==1 && !_WIN32
-  for (int i = 1; i < 8; i++) 
+#if PARALLEL==1
+  for (int i = 1; i < threadval; i++) 
     if (maxVelMag < maxVelMagThreads[i]) 
       maxVelMag = maxVelMagThreads[i];
+  delete [] maxVelMagThreads;
 #endif
 
+
   // prepare density for an advection
   SWAP_POINTERS(_densityBig, _densityBigOld);
 
@@ -941,15 +972,56 @@ void WTURBULENCE::stepTurbulenceFull(float dtOrg, float* xvel, float* yvel, floa
   const float dtSubdiv = dt / (float)totalSubsteps;
 
   // set boundaries of big velocity grid
-  FLUID_3D::setZeroX(bigUx, _resBig); 
-  FLUID_3D::setZeroY(bigUy, _resBig); 
-  FLUID_3D::setZeroZ(bigUz, _resBig);
+  FLUID_3D::setZeroX(bigUx, _resBig, 0 , _resBig[2]); 
+  FLUID_3D::setZeroY(bigUy, _resBig, 0 , _resBig[2]); 
+  FLUID_3D::setZeroZ(bigUz, _resBig, 0 , _resBig[2]);
+
+#if PARALLEL==1
+  int stepParts = threadval*2;	// Dividing parallelized sections into numOfThreads * 2 sections
+  float partSize = (float)_zResBig/stepParts;	// Size of one part;
+
+  if (partSize < 4) {stepParts = threadval;					// If the slice gets too low (might actually slow things down, change it to larger
+					partSize = (float)_zResBig/stepParts;}
+  if (partSize < 4) {stepParts = (int)(ceil((float)_zResBig/4.0f));	// If it's still too low (only possible on future systems with +24 cores), change it to 4
+					partSize = (float)_zResBig/stepParts;}
+#else
+  int zBegin=0;
+  int zEnd=_resBig[2];
+#endif
 
   // do the MacCormack advection, with substepping if necessary
   for(int substep = 0; substep < totalSubsteps; substep++)
   {
-    FLUID_3D::advectFieldMacCormack(dtSubdiv, bigUx, bigUy, bigUz, 
-        _densityBigOld, _densityBig, tempBig1, tempBig2, _resBig, NULL);
+
+#if PARALLEL==1
+	#pragma omp parallel
+	{
+
+	#pragma omp for schedule(static,1)
+	for (int i=0; i<stepParts; i++)
+	{
+		int zBegin = (int)((float)i*partSize + 0.5f);
+		int zEnd = (int)((float)(i+1)*partSize + 0.5f);
+#endif
+		FLUID_3D::advectFieldMacCormack1(dtSubdiv, bigUx, bigUy, bigUz, 
+		    _densityBigOld, tempBig1, _resBig, zBegin, zEnd);
+#if PARALLEL==1
+	}
+
+	#pragma omp barrier
+
+	#pragma omp for schedule(static,1)
+	for (int i=0; i<stepParts; i++)
+	{
+		int zBegin = (int)((float)i*partSize + 0.5f);
+		int zEnd = (int)((float)(i+1)*partSize + 0.5f);
+#endif
+		FLUID_3D::advectFieldMacCormack2(dtSubdiv, bigUx, bigUy, bigUz, 
+		    _densityBigOld, _densityBig, tempBig1, tempBig2, _resBig, NULL, zBegin, zEnd);
+#if PARALLEL==1
+	}
+	}
+#endif
 
     if (substep < totalSubsteps - 1) 
       SWAP_POINTERS(_densityBig, _densityBigOld);
@@ -964,7 +1036,7 @@ void WTURBULENCE::stepTurbulenceFull(float dtOrg, float* xvel, float* yvel, floa
   free(highFreqEnergy);
   
   // wipe the density borders
-  FLUID_3D::setZeroBorder(_densityBig, _resBig);
+  FLUID_3D::setZeroBorder(_densityBig, _resBig, 0 , _resBig[2]);
     
   // reset texture coordinates now in preparation for next timestep
   // Shouldn't do this before generating the noise because then the 
diff --git a/intern/smoke/intern/smoke_API.cpp b/intern/smoke/intern/smoke_API.cpp
index 2d1d590fcc0..c6a3985c8d0 100644
--- a/intern/smoke/intern/smoke_API.cpp
+++ b/intern/smoke/intern/smoke_API.cpp
@@ -30,6 +30,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <math.h>
 
 // y in smoke is z in blender
 extern "C" FLUID_3D *smoke_init(int *res, float *p0, float dt)
@@ -110,8 +111,8 @@ extern "C" void smoke_dissolve(FLUID_3D *fluid, int speed, int log)
 
 			heat[i] *= (1.0 - dydx);
 
-			if(heat[i] < 0.0f)
-				heat[i] = 0.0f;
+			/*if(heat[i] < 0.0f)
+				heat[i] = 0.0f;*/
 		}
 	}
 	else // linear falloff
@@ -127,10 +128,9 @@ extern "C" void smoke_dissolve(FLUID_3D *fluid, int speed, int log)
 			if(density[i] < 0.0f)
 				density[i] = 0.0f;
 
-			heat[i] -= dydx;
-
-			if(heat[i] < 0.0f)
-				heat[i] = 0.0f;
+			if(abs(heat[i]) < dydx) heat[i] = 0.0f;
+			else if (heat[i]>0.0f) heat[i] -= dydx;
+			else if (heat[i]<0.0f) heat[i] += dydx;
 				
 		}
 	}
-- 
cgit v1.2.3