51 files changed, 13799 insertions, 3 deletions
diff --git a/SConstruct b/SConstruct
index 8ff9d4f41eb..b5a6ce231bf 100644
--- a/SConstruct
+++ b/SConstruct
@@ -962,7 +962,8 @@ def blender_libs(env):
 		'blender_LOD',
 		'blender_BSP',
 		'blender_blenkernel',
-		'blender_IK'])
+		'blender_IK',
+		'blender_ONL'])
 
 def ketsji_libs(env):
 	"""
diff --git a/intern/Makefile b/intern/Makefile
index 08ab03150fe..af64e44cdf4 100644
--- a/intern/Makefile
+++ b/intern/Makefile
@@ -35,7 +35,7 @@ SOURCEDIR = intern
 # include nan_subdirs.mk
 
 ALLDIRS = string ghost guardedalloc bmfont moto container memutil
-ALLDIRS += decimation iksolver bsp SoundSystem
+ALLDIRS += decimation iksolver bsp SoundSystem opennl
 
 all::
 	@for i in $(ALLDIRS); do \
diff --git a/intern/SConscript b/intern/SConscript
index 833a0316634..afbcd24b8be 100644
--- a/intern/SConscript
+++ b/intern/SConscript
@@ -8,7 +8,8 @@ SConscript(['SoundSystem/SConscript',
             'container/SConscript',
             'memutil/SConscript/',
             'decimation/SConscript',
-            'iksolver/SConscript'])
+            'iksolver/SConscript',
+            'opennl/SConscript'])
 
 NEW_CSG='false'
 
diff --git a/intern/opennl/Makefile b/intern/opennl/Makefile
new file mode 100644
index 00000000000..8aa0d4f590b
--- /dev/null
+++ b/intern/opennl/Makefile
@@ -0,0 +1,67 @@
+#
+# $Id$
+#
+# ***** BEGIN GPL/BL DUAL LICENSE BLOCK *****
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version. The Blender
+# Foundation also sells licenses for use in proprietary software under
+# the Blender License.  See http://www.blender.org/BL/ for information
+# about this.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#
+# The Original Code is Copyright (C) 2001-2002 by NaN Holding BV.
+# All rights reserved.
+#
+# The Original Code is: all of this file.
+#
+# Contributor(s): Hans Lambermont
+#
+# ***** END GPL/BL DUAL LICENSE BLOCK *****
+# opennl main makefile.
+#
+
+include nan_definitions.mk
+
+LIBNAME = opennl
+LIBNAME_SLU = superlu
+SOURCEDIR = intern/$(LIBNAME)
+SOURCEDIR_SLU = intern/$(LIBNAME_SLU)
+DIR = $(OCGDIR)/$(SOURCEDIR)
+DIR_SLU = $(OCGDIR)/$(SOURCEDIR_SLU)
+DIRS = intern superlu
+
+include nan_subdirs.mk
+
+install: all debug
+	@[ -d $(NAN_OPENNL) ] || mkdir $(NAN_OPENNL)
+	@[ -d $(NAN_OPENNL)/include ] || mkdir $(NAN_OPENNL)/include
+	@[ -d $(NAN_OPENNL)/lib ] || mkdir $(NAN_OPENNL)/lib
+	@[ -d $(NAN_OPENNL)/lib/debug ] || mkdir $(NAN_OPENNL)/lib/debug
+	@../tools/cpifdiff.sh $(DIR)/libopennl.a $(NAN_OPENNL)/lib/
+	@../tools/cpifdiff.sh $(DIR)/debug/libopennl.a $(NAN_OPENNL)/lib/debug/
+ifeq ($(OS),darwin)
+	ranlib $(NAN_OPENNL)/lib/libopennl.a
+	ranlib $(NAN_OPENNL)/lib/debug/libopennl.a
+endif
+	@../tools/cpifdiff.sh extern/*.h $(NAN_OPENNL)/include/
+	@[ -d $(NAN_SUPERLU) ] || mkdir $(NAN_SUPERLU)
+	@[ -d $(NAN_SUPERLU)/lib ] || mkdir $(NAN_SUPERLU)/lib
+	@[ -d $(NAN_SUPERLU)/lib/debug ] || mkdir $(NAN_SUPERLU)/lib/debug
+	@../tools/cpifdiff.sh $(DIR_SLU)/libsuperlu.a $(NAN_SUPERLU)/lib/
+	@../tools/cpifdiff.sh $(DIR_SLU)/debug/libsuperlu.a $(NAN_SUPERLU)/lib/debug/
+ifeq ($(OS),darwin)
+	ranlib $(NAN_SUPERLU)/lib/libsuperlu.a
+	ranlib $(NAN_SUPERLU)/lib/debug/libsuperlu.a
+endif
+
diff --git a/intern/opennl/SConscript b/intern/opennl/SConscript
new file mode 100644
index 00000000000..4e0260c7f33
--- /dev/null
+++ b/intern/opennl/SConscript
@@ -0,0 +1,43 @@
+Import ('user_options_dict')
+Import ('library_env')
+
+opennl_env = library_env.Copy ()
+
+source_files = ['intern/opennl.c',
+                'superlu/colamd.c',
+                'superlu/get_perm_c.c',
+                'superlu/heap_relax_snode.c',
+                'superlu/lsame.c',
+                'superlu/memory.c',
+                'superlu/mmd.c',
+                'superlu/relax_snode.c',
+                'superlu/scolumn_bmod.c',
+                'superlu/scolumn_dfs.c',
+                'superlu/scopy_to_ucol.c',
+                'superlu/sgssv.c',
+                'superlu/sgstrf.c',
+                'superlu/sgstrs.c',
+                'superlu/smemory.c',
+                'superlu/smyblas2.c',
+                'superlu/sp_coletree.c',
+                'superlu/sp_ienv.c',
+                'superlu/sp_preorder.c',
+                'superlu/spanel_bmod.c',
+                'superlu/spanel_dfs.c',
+                'superlu/spivotL.c',
+                'superlu/spruneL.c',
+                'superlu/ssnode_bmod.c',
+                'superlu/ssnode_dfs.c',
+                'superlu/ssp_blas2.c',
+                'superlu/ssp_blas3.c',
+                'superlu/strsv.c',
+                'superlu/superlu_timer.c',
+                'superlu/sutil.c',
+                'superlu/util.c',
+                'superlu/xerbla.c']
+
+opennl_env.Append (CPPPATH = ['extern',
+                              'superlu'])
+
+opennl_env.Library (target='#'+user_options_dict['BUILD_DIR']+'/lib/blender_ONL', source=source_files)
+
diff --git a/intern/opennl/doc/OpenNL_License.txt b/intern/opennl/doc/OpenNL_License.txt
new file mode 100644
index 00000000000..4e8d97fd526
--- /dev/null
+++ b/intern/opennl/doc/OpenNL_License.txt
@@ -0,0 +1,341 @@
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+                          675 Mass Ave, Cambridge, MA 02139, USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+
+		     END OF TERMS AND CONDITIONS
+
+	Appendix: How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) 19yy  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) 19yy name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
+
diff --git a/intern/opennl/doc/OpenNL_Readme.txt b/intern/opennl/doc/OpenNL_Readme.txt
new file mode 100644
index 00000000000..e6aea3c0286
--- /dev/null
+++ b/intern/opennl/doc/OpenNL_Readme.txt
@@ -0,0 +1,13 @@
+
+This is OpenNL, a library to easily construct and solve sparse linear systems.
+* OpenNL is supplied with a set of iterative solvers (Conjugate gradient,
+  BICGSTAB, GMRes) and preconditioners (Jacobi, SSOR). 
+* OpenNL can also use other solvers (SuperLU 3.0 supported as an OpenNL
+  extension)
+
+Note that to be compatible with OpenNL, SuperLU 3.0 needs to be compiled with
+the following flag (see make.inc in SuperLU3.0):
+CDEFS = -DAdd_ (the default is -DAdd__, just remove the second underscore)
+
+OpenNL was modified for Blender to be used only as a wrapper for SuperLU.
+
diff --git a/intern/opennl/doc/SuperLU_License.txt b/intern/opennl/doc/SuperLU_License.txt
new file mode 100644
index 00000000000..f31a01782e2
--- /dev/null
+++ b/intern/opennl/doc/SuperLU_License.txt
@@ -0,0 +1,31 @@
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+Redistribution and use in source and binary forms, with or without
+modification,
+are permitted provided that the following conditions are met: 
+
+(1) Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer. 
+(2) Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution. 
+(3) Neither the name of Lawrence Berkeley National Laboratory, U.S. Dept. of
+Energy nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ 
diff --git a/intern/opennl/doc/SuperLU_Readme.txt b/intern/opennl/doc/SuperLU_Readme.txt
new file mode 100644
index 00000000000..c1cedd09893
--- /dev/null
+++ b/intern/opennl/doc/SuperLU_Readme.txt
@@ -0,0 +1,52 @@
+		SuperLU (Version 3.0)
+		=====================
+
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met: 
+
+(1) Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer. 
+(2) Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution. 
+(3) Neither the name of Lawrence Berkeley National Laboratory, U.S. Dept. of
+Energy nor the names of its contributors may be used to endorse or promote
+products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+  
+
+SuperLU contains a set of subroutines to solve a sparse linear system 
+A*X=B. It uses Gaussian elimination with partial pivoting (GEPP). 
+The columns of A may be preordered before factorization; the 
+preordering for sparsity is completely separate from the factorization.
+
+SuperLU is implemented in ANSI C, and must be compiled with standard 
+ANSI C compilers. It provides functionality for both real and complex
+matrices, in both single and double precision. The file names for the 
+single-precision real version start with letter "s" (such as sgstrf.c);
+the file names for the double-precision real version start with letter "d"
+(such as dgstrf.c); the file names for the single-precision complex
+version start with letter "c" (such as cgstrf.c); the file names
+for the double-precision complex version start with letter "z" 
+(such as zgstrf.c).
+
+SuperLU was modified for Blender to only include single-precision
+functionality.
+
diff --git a/intern/opennl/extern/ONL_opennl.h b/intern/opennl/extern/ONL_opennl.h
new file mode 100644
index 00000000000..5e4bd24313c
--- /dev/null
+++ b/intern/opennl/extern/ONL_opennl.h
@@ -0,0 +1,163 @@
+/*
+ *  $Id$
+ *
+ *  OpenNL: Numerical Library
+ *  Copyright (C) 2004 Bruno Levy
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ *  If you modify this software, you should include a notice giving the
+ *  name of the person performing the modification, the date of modification,
+ *  and the reason for such modification.
+ *
+ *  Contact: Bruno Levy
+ *
+ *     levy@loria.fr
+ *
+ *     ISA Project
+ *     LORIA, INRIA Lorraine, 
+ *     Campus Scientifique, BP 239
+ *     54506 VANDOEUVRE LES NANCY CEDEX 
+ *     FRANCE
+ *
+ *  Note that the GNU General Public License does not permit incorporating
+ *  the Software into proprietary programs. 
+ */
+
+/*
+#define NL_DEBUG
+#define NL_PARANOID
+*/
+
+#define NL_USE_SUPERLU
+
+#ifndef nlOPENNL_H
+#define nlOPENNL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NL_VERSION_0_0 1
+
+/*
+ *
+ * Datatypes
+ *
+ */
+
+typedef unsigned int	NLenum;
+typedef unsigned char	NLboolean;
+typedef unsigned int	NLbitfield;
+typedef void		NLvoid;
+typedef signed char	NLbyte;		/* 1-byte signed */
+typedef short		NLshort;	/* 2-byte signed */
+typedef int		NLint;		/* 4-byte signed */
+typedef unsigned char	NLubyte;	/* 1-byte unsigned */
+typedef unsigned short	NLushort;	/* 2-byte unsigned */
+typedef unsigned int	NLuint;		/* 4-byte unsigned */
+typedef int		NLsizei;	/* 4-byte signed */
+typedef float		NLfloat;	/* single precision float */
+typedef double		NLdouble;	/* double precision float */
+
+typedef void* NLContext ;
+
+/*
+ *
+ * Constants
+ *
+ */
+
+#define NL_FALSE   0x0
+#define NL_TRUE    0x1
+
+/* Primitives */
+
+#define NL_SYSTEM  0x0
+#define NL_MATRIX  0x1
+#define NL_ROW     0x2
+
+/* Solver Parameters */
+
+#define NL_SOLVER           0x100
+#define NL_NB_VARIABLES     0x101
+#define NL_LEAST_SQUARES    0x102
+#define NL_SYMMETRIC        0x106
+#define NL_ERROR            0x108
+
+/* Enable / Disable */
+
+#define NL_NORMALIZE_ROWS  0x400
+
+/* Row parameters */
+
+#define NL_RIGHT_HAND_SIDE 0x500
+#define NL_ROW_SCALING     0x501
+
+/*
+ * Contexts
+ */
+    NLContext nlNewContext() ;
+    void nlDeleteContext(NLContext context) ;
+    void nlMakeCurrent(NLContext context) ;
+    NLContext nlGetCurrent() ;
+
+/*
+ * State set/get
+ */
+
+    void nlSolverParameterf(NLenum pname, NLfloat param) ;
+    void nlSolverParameteri(NLenum pname, NLint param) ;
+
+    void nlRowParameterf(NLenum pname, NLfloat param) ;
+    void nlRowParameteri(NLenum pname, NLint param) ;
+
+    void nlGetBooleanv(NLenum pname, NLboolean* params) ;
+    void nlGetFloatv(NLenum pname, NLfloat* params) ;
+    void nlGetIntergerv(NLenum pname, NLint* params) ;
+
+    void nlEnable(NLenum pname) ;
+    void nlDisable(NLenum pname) ;
+    NLboolean nlIsEnabled(NLenum pname) ;
+
+/*
+ * Variables
+ */
+    void nlSetVariable(NLuint index, NLfloat value) ;
+    NLfloat nlGetVariable(NLuint index) ;
+    void nlLockVariable(NLuint index) ;
+    void nlUnlockVariable(NLuint index) ;
+    NLboolean nlVariableIsLocked(NLuint index) ;
+
+/*
+ * Begin/End
+ */
+
+    void nlBegin(NLenum primitive) ;
+    void nlEnd(NLenum primitive) ;
+    void nlCoefficient(NLuint index, NLfloat value) ;
+
+/*
+ * Solve
+ */
+
+    NLboolean nlSolve() ;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/intern/opennl/intern/Makefile b/intern/opennl/intern/Makefile
new file mode 100644
index 00000000000..2e57905d931
--- /dev/null
+++ b/intern/opennl/intern/Makefile
@@ -0,0 +1,43 @@
+#
+# $Id$
+#
+# ***** BEGIN GPL/BL DUAL LICENSE BLOCK *****
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version. The Blender
+# Foundation also sells licenses for use in proprietary software under
+# the Blender License.  See http://www.blender.org/BL/ for information
+# about this.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#
+# The Original Code is Copyright (C) 2001-2002 by NaN Holding BV.
+# All rights reserved.
+#
+# The Original Code is: all of this file.
+#
+# Contributor(s): none yet.
+#
+# ***** END GPL/BL DUAL LICENSE BLOCK *****
+# opennl intern Makefile
+#
+
+LIBNAME = opennl
+DIR = $(OCGDIR)/intern/$(LIBNAME)
+
+include nan_compile.mk
+
+CCFLAGS += $(NAN_LEVEL_2_CPP_WARNINGS)
+
+CPPFLAGS += -I../superlu -I../extern
+
+
diff --git a/intern/opennl/intern/opennl.c b/intern/opennl/intern/opennl.c
new file mode 100644
index 00000000000..be797223f51
--- /dev/null
+++ b/intern/opennl/intern/opennl.c
@@ -0,0 +1,1151 @@
+/*
+ *  $Id$
+ *
+ *  OpenNL: Numerical Library
+ *  Copyright (C) 2004 Bruno Levy
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ *  If you modify this software, you should include a notice giving the
+ *  name of the person performing the modification, the date of modification,
+ *  and the reason for such modification.
+ *
+ *  Contact: Bruno Levy
+ *
+ *     levy@loria.fr
+ *
+ *     ISA Project
+ *     LORIA, INRIA Lorraine, 
+ *     Campus Scientifique, BP 239
+ *     54506 VANDOEUVRE LES NANCY CEDEX 
+ *     FRANCE
+ *
+ *  Note that the GNU General Public License does not permit incorporating
+ *  the Software into proprietary programs. 
+ */
+
+#include "ONL_opennl.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#ifdef NL_PARANOID
+#ifndef NL_DEBUG
+#define NL_DEBUG
+#endif
+#endif
+
+/* SuperLU includes */
+#include <ssp_defs.h>
+#include <util.h>
+
+/************************************************************************************/
+/* Assertions */
+
+
+static void __nl_assertion_failed(char* cond, char* file, int line) {
+    fprintf(
+        stderr, 
+        "OpenNL assertion failed: %s, file:%s, line:%d\n",
+        cond,file,line
+    ) ;
+    abort() ;
+}
+
+static void __nl_range_assertion_failed(
+    float x, float min_val, float max_val, char* file, int line
+) {
+    fprintf(
+        stderr, 
+        "OpenNL range assertion failed: %f in [ %f ... %f ], file:%s, line:%d\n",
+        x, min_val, max_val, file,line
+    ) ;
+    abort() ;
+}
+
+static void __nl_should_not_have_reached(char* file, int line) {
+    fprintf(
+        stderr, 
+        "OpenNL should not have reached this point: file:%s, line:%d\n",
+        file,line
+    ) ;
+    abort() ;
+}
+
+
+#define __nl_assert(x) {                                        \
+    if(!(x)) {                                                  \
+        __nl_assertion_failed(#x,__FILE__, __LINE__) ;          \
+    }                                                           \
+} 
+
+#define __nl_range_assert(x,min_val,max_val) {                  \
+    if(((x) < (min_val)) || ((x) > (max_val))) {                \
+        __nl_range_assertion_failed(x, min_val, max_val,        \
+            __FILE__, __LINE__                                  \
+        ) ;                                                     \
+    }                                                           \
+}
+
+#define __nl_assert_not_reached {                               \
+    __nl_should_not_have_reached(__FILE__, __LINE__) ;          \
+}
+
+#ifdef NL_DEBUG
+#define __nl_debug_assert(x) __nl_assert(x)
+#define __nl_debug_range_assert(x,min_val,max_val) __nl_range_assert(x,min_val,max_val)
+#else
+#define __nl_debug_assert(x) 
+#define __nl_debug_range_assert(x,min_val,max_val) 
+#endif
+
+#ifdef NL_PARANOID
+#define __nl_parano_assert(x) __nl_assert(x)
+#define __nl_parano_range_assert(x,min_val,max_val) __nl_range_assert(x,min_val,max_val)
+#else
+#define __nl_parano_assert(x) 
+#define __nl_parano_range_assert(x,min_val,max_val) 
+#endif
+
+/************************************************************************************/
+/* classic macros */
+
+#ifndef MIN
+#define MIN(x,y) (((x) < (y)) ? (x) : (y)) 
+#endif
+
+#ifndef MAX
+#define MAX(x,y) (((x) > (y)) ? (x) : (y)) 
+#endif
+
+/************************************************************************************/
+/* memory management */
+
+#define __NL_NEW(T)                (T*)(calloc(1, sizeof(T))) 
+#define __NL_NEW_ARRAY(T,NB)       (T*)(calloc((NB),sizeof(T))) 
+#define __NL_RENEW_ARRAY(T,x,NB)   (T*)(realloc(x,(NB)*sizeof(T))) 
+#define __NL_DELETE(x)             free(x); x = NULL 
+#define __NL_DELETE_ARRAY(x)       free(x); x = NULL
+
+#define __NL_CLEAR(T, x)           memset(x, 0, sizeof(T)) 
+#define __NL_CLEAR_ARRAY(T,x,NB)   memset(x, 0, (NB)*sizeof(T)) 
+
+/************************************************************************************/
+/* Dynamic arrays for sparse row/columns */
+
+typedef struct {
+    NLuint   index ;
+    NLfloat value ;
+} __NLCoeff ;
+
+typedef struct {
+    NLuint size ;
+    NLuint capacity ;
+    __NLCoeff* coeff ;
+} __NLRowColumn ;
+
+static void __nlRowColumnConstruct(__NLRowColumn* c) {
+    c->size     = 0 ;
+    c->capacity = 0 ;
+    c->coeff    = NULL ;
+}
+
+static void __nlRowColumnDestroy(__NLRowColumn* c) {
+    __NL_DELETE_ARRAY(c->coeff) ;
+#ifdef NL_PARANOID
+    __NL_CLEAR(__NLRowColumn, c) ; 
+#endif
+}
+
+static void __nlRowColumnGrow(__NLRowColumn* c) {
+    if(c->capacity != 0) {
+        c->capacity = 2 * c->capacity ;
+        c->coeff = __NL_RENEW_ARRAY(__NLCoeff, c->coeff, c->capacity) ;
+    } else {
+        c->capacity = 4 ;
+        c->coeff = __NL_NEW_ARRAY(__NLCoeff, c->capacity) ;
+    }
+}
+
+static void __nlRowColumnAdd(__NLRowColumn* c, NLint index, NLfloat value) {
+    NLuint i ;
+    for(i=0; i<c->size; i++) {
+        if(c->coeff[i].index == (NLuint)index) {
+            c->coeff[i].value += value ;
+            return ;
+        }
+    }
+    if(c->size == c->capacity) {
+        __nlRowColumnGrow(c) ;
+    }
+    c->coeff[c->size].index = index ;
+    c->coeff[c->size].value = value ;
+    c->size++ ;
+}
+
+/* Does not check whether the index already exists */
+static void __nlRowColumnAppend(__NLRowColumn* c, NLint index, NLfloat value) {
+    if(c->size == c->capacity) {
+        __nlRowColumnGrow(c) ;
+    }
+    c->coeff[c->size].index = index ;
+    c->coeff[c->size].value = value ;
+    c->size++ ;
+}
+
+static void __nlRowColumnZero(__NLRowColumn* c) {
+    c->size = 0 ;
+}
+
+static void __nlRowColumnClear(__NLRowColumn* c) {
+    c->size     = 0 ;
+    c->capacity = 0 ;
+    __NL_DELETE_ARRAY(c->coeff) ;
+}
+
+/************************************************************************************/
+/* SparseMatrix data structure */
+
+#define __NL_ROWS      1
+#define __NL_COLUMNS   2
+#define __NL_SYMMETRIC 4
+
+typedef struct {
+    NLuint m ;
+    NLuint n ;
+    NLuint diag_size ;
+    NLenum storage ;
+    __NLRowColumn* row ;
+    __NLRowColumn* column ;
+    NLfloat*      diag ;
+} __NLSparseMatrix ;
+
+
+static void __nlSparseMatrixConstruct(
+    __NLSparseMatrix* M, NLuint m, NLuint n, NLenum storage
+) {
+    NLuint i ;
+    M->m = m ;
+    M->n = n ;
+    M->storage = storage ;
+    if(storage & __NL_ROWS) {
+        M->row = __NL_NEW_ARRAY(__NLRowColumn, m) ;
+        for(i=0; i<n; i++) {
+            __nlRowColumnConstruct(&(M->row[i])) ;
+        }
+    } else {
+        M->row = NULL ;
+    }
+
+    if(storage & __NL_COLUMNS) {
+        M->column = __NL_NEW_ARRAY(__NLRowColumn, n) ;
+        for(i=0; i<n; i++) {
+            __nlRowColumnConstruct(&(M->column[i])) ;
+        }
+    } else {
+        M->column = NULL ;
+    }
+
+    M->diag_size = MIN(m,n) ;
+    M->diag = __NL_NEW_ARRAY(NLfloat, M->diag_size) ;
+}
+
+static void __nlSparseMatrixDestroy(__NLSparseMatrix* M) {
+    NLuint i ;
+    __NL_DELETE_ARRAY(M->diag) ;
+    if(M->storage & __NL_ROWS) {
+        for(i=0; i<M->m; i++) {
+            __nlRowColumnDestroy(&(M->row[i])) ;
+        }
+        __NL_DELETE_ARRAY(M->row) ;
+    }
+    if(M->storage & __NL_COLUMNS) {
+        for(i=0; i<M->n; i++) {
+            __nlRowColumnDestroy(&(M->column[i])) ;
+        }
+        __NL_DELETE_ARRAY(M->column) ;
+    }
+#ifdef NL_PARANOID
+    __NL_CLEAR(__NLSparseMatrix,M) ;
+#endif
+}
+
+static void __nlSparseMatrixAdd(
+    __NLSparseMatrix* M, NLuint i, NLuint j, NLfloat value
+) {
+    __nl_parano_range_assert(i, 0, M->m - 1) ;
+    __nl_parano_range_assert(j, 0, M->n - 1) ;
+    if((M->storage & __NL_SYMMETRIC) && (j > i)) {
+        return ;
+    }
+    if(i == j) {
+        M->diag[i] += value ;
+    }
+    if(M->storage & __NL_ROWS) {
+        __nlRowColumnAdd(&(M->row[i]), j, value) ;
+    }
+    if(M->storage & __NL_COLUMNS) {
+        __nlRowColumnAdd(&(M->column[j]), i, value) ;
+    }
+}
+
+static void __nlSparseMatrixClear( __NLSparseMatrix* M) {
+    NLuint i ;
+    if(M->storage & __NL_ROWS) {
+        for(i=0; i<M->m; i++) {
+            __nlRowColumnClear(&(M->row[i])) ;
+        }
+    }
+    if(M->storage & __NL_COLUMNS) {
+        for(i=0; i<M->n; i++) {
+            __nlRowColumnClear(&(M->column[i])) ;
+        }
+    }
+    __NL_CLEAR_ARRAY(NLfloat, M->diag, M->diag_size) ;    
+}
+
+/* Returns the number of non-zero coefficients */
+static NLuint __nlSparseMatrixNNZ( __NLSparseMatrix* M) {
+    NLuint nnz = 0 ;
+    NLuint i ;
+    if(M->storage & __NL_ROWS) {
+        for(i = 0; i<M->m; i++) {
+            nnz += M->row[i].size ;
+        }
+    } else if (M->storage & __NL_COLUMNS) {
+        for(i = 0; i<M->n; i++) {
+            nnz += M->column[i].size ;
+        }
+    } else {
+        __nl_assert_not_reached ;
+    }
+    return nnz ;
+}
+
+/************************************************************************************/
+/* SparseMatrix x Vector routines, internal helper routines */
+
+static void __nlSparseMatrix_mult_rows_symmetric(
+    __NLSparseMatrix* A, NLfloat* x, NLfloat* y
+) {
+    NLuint m = A->m ;
+    NLuint i,ij ;
+    __NLRowColumn* Ri = NULL ;
+    __NLCoeff* c = NULL ;
+    for(i=0; i<m; i++) {
+        y[i] = 0 ;
+        Ri = &(A->row[i]) ;
+        for(ij=0; ij<Ri->size; ij++) {
+            c = &(Ri->coeff[ij]) ;
+            y[i] += c->value * x[c->index] ;
+            if(i != c->index) {
+                y[c->index] += c->value * x[i] ;
+            }
+        }
+    }
+}
+
+static void __nlSparseMatrix_mult_rows(
+    __NLSparseMatrix* A, NLfloat* x, NLfloat* y
+) {
+    NLuint m = A->m ;
+    NLuint i,ij ;
+    __NLRowColumn* Ri = NULL ;
+    __NLCoeff* c = NULL ;
+    for(i=0; i<m; i++) {
+        y[i] = 0 ;
+        Ri = &(A->row[i]) ;
+        for(ij=0; ij<Ri->size; ij++) {
+            c = &(Ri->coeff[ij]) ;
+            y[i] += c->value * x[c->index] ;
+        }
+    }
+}
+
+static void __nlSparseMatrix_mult_cols_symmetric(
+    __NLSparseMatrix* A, NLfloat* x, NLfloat* y
+) {
+    NLuint n = A->n ;
+    NLuint j,ii ;
+    __NLRowColumn* Cj = NULL ;
+    __NLCoeff* c = NULL ;
+    for(j=0; j<n; j++) {
+        y[j] = 0 ;
+        Cj = &(A->column[j]) ;
+        for(ii=0; ii<Cj->size; ii++) {
+            c = &(Cj->coeff[ii]) ;
+            y[c->index] += c->value * x[j] ;
+            if(j != c->index) {
+                y[j] += c->value * x[c->index] ;
+            }
+        }
+    }
+}
+
+static void __nlSparseMatrix_mult_cols(
+    __NLSparseMatrix* A, NLfloat* x, NLfloat* y
+) {
+    NLuint n = A->n ;
+    NLuint j,ii ; 
+    __NLRowColumn* Cj = NULL ;
+    __NLCoeff* c = NULL ;
+    __NL_CLEAR_ARRAY(NLfloat, y, A->m) ;
+    for(j=0; j<n; j++) {
+        Cj = &(A->column[j]) ;
+        for(ii=0; ii<Cj->size; ii++) {
+            c = &(Cj->coeff[ii]) ;
+            y[c->index] += c->value * x[j] ;
+        }
+    }
+}
+
+/************************************************************************************/
+/* SparseMatrix x Vector routines, main driver routine */
+
+void __nlSparseMatrixMult(__NLSparseMatrix* A, NLfloat* x, NLfloat* y) {
+    if(A->storage & __NL_ROWS) {
+        if(A->storage & __NL_SYMMETRIC) {
+            __nlSparseMatrix_mult_rows_symmetric(A, x, y) ;
+        } else {
+            __nlSparseMatrix_mult_rows(A, x, y) ;
+        }
+    } else {
+        if(A->storage & __NL_SYMMETRIC) {
+            __nlSparseMatrix_mult_cols_symmetric(A, x, y) ;
+        } else {
+            __nlSparseMatrix_mult_cols(A, x, y) ;
+        }
+    }
+}
+
+/************************************************************************************/
+/* NLContext data structure */
+
+typedef void(*__NLMatrixFunc)(float* x, float* y) ;
+
+typedef struct {
+    NLfloat  value ;
+    NLboolean locked ;
+    NLuint    index ;
+} __NLVariable ;
+
+#define __NL_STATE_INITIAL            0
+#define __NL_STATE_SYSTEM             1
+#define __NL_STATE_MATRIX             2
+#define __NL_STATE_ROW                3
+#define __NL_STATE_MATRIX_CONSTRUCTED 4
+#define __NL_STATE_SYSTEM_CONSTRUCTED 5
+#define __NL_STATE_SOLVED             6
+
+typedef struct {
+    NLenum           state ;
+    __NLVariable*    variable ;
+    NLuint           n ;
+    __NLSparseMatrix M ;
+    __NLRowColumn    af ;
+    __NLRowColumn    al ;
+    __NLRowColumn    xl ;
+    NLfloat*        x ;
+    NLfloat*        b ;
+    NLfloat         right_hand_side ;
+    NLfloat         row_scaling ;
+    NLuint           nb_variables ;
+    NLuint           current_row ;
+    NLboolean        least_squares ;
+    NLboolean        symmetric ;
+    NLboolean        normalize_rows ;
+    NLboolean        alloc_M ;
+    NLboolean        alloc_af ;
+    NLboolean        alloc_al ;
+    NLboolean        alloc_xl ;
+    NLboolean        alloc_variable ;
+    NLboolean        alloc_x ;
+    NLboolean        alloc_b ;
+    NLfloat         error ;
+    __NLMatrixFunc   matrix_vector_prod ;
+} __NLContext ;
+
+static __NLContext* __nlCurrentContext = NULL ;
+
+void __nlMatrixVectorProd_default(NLfloat* x, NLfloat* y) {
+    __nlSparseMatrixMult(&(__nlCurrentContext->M), x, y) ;
+}
+
+
+NLContext nlNewContext() {
+    __NLContext* result      = __NL_NEW(__NLContext) ;
+    result->state            = __NL_STATE_INITIAL ;
+    result->row_scaling      = 1.0 ;
+    result->right_hand_side  = 0.0 ;
+    result->matrix_vector_prod = __nlMatrixVectorProd_default ;
+    nlMakeCurrent(result) ;
+    return result ;
+}
+
+void nlDeleteContext(NLContext context_in) {
+    __NLContext* context = (__NLContext*)(context_in) ;
+    if(__nlCurrentContext == context) {
+        __nlCurrentContext = NULL ;
+    }
+    if(context->alloc_M) {
+        __nlSparseMatrixDestroy(&context->M) ;
+    }
+    if(context->alloc_af) {
+        __nlRowColumnDestroy(&context->af) ;
+    }
+    if(context->alloc_al) {
+        __nlRowColumnDestroy(&context->al) ;
+    }
+    if(context->alloc_xl) {
+        __nlRowColumnDestroy(&context->xl) ;
+    }
+    if(context->alloc_variable) {
+        __NL_DELETE_ARRAY(context->variable) ;
+    }
+    if(context->alloc_x) {
+        __NL_DELETE_ARRAY(context->x) ;
+    }
+    if(context->alloc_b) {
+        __NL_DELETE_ARRAY(context->b) ;
+    }
+
+#ifdef NL_PARANOID
+    __NL_CLEAR(__NLContext, context) ;
+#endif
+    __NL_DELETE(context) ;
+}
+
+void nlMakeCurrent(NLContext context) {
+    __nlCurrentContext = (__NLContext*)(context) ;
+}
+
+NLContext nlGetCurrent() {
+    return __nlCurrentContext ;
+}
+
+void __nlCheckState(NLenum state) {
+    __nl_assert(__nlCurrentContext->state == state) ;
+}
+
+void __nlTransition(NLenum from_state, NLenum to_state) {
+    __nlCheckState(from_state) ;
+    __nlCurrentContext->state = to_state ;
+}
+
+/************************************************************************************/
+/* Get/Set parameters */
+
+void nlSolverParameterf(NLenum pname, NLfloat param) {
+    __nlCheckState(__NL_STATE_INITIAL) ;
+    switch(pname) {
+    case NL_NB_VARIABLES: {
+        __nl_assert(param > 0) ;
+        __nlCurrentContext->nb_variables = (NLuint)param ;
+    } break ;
+    case NL_LEAST_SQUARES: {
+        __nlCurrentContext->least_squares = (NLboolean)param ;
+    } break ;
+    case NL_SYMMETRIC: {
+        __nlCurrentContext->symmetric = (NLboolean)param ;        
+    }
+    default: {
+        __nl_assert_not_reached ;
+    } break ;
+    }
+}
+
+void nlSolverParameteri(NLenum pname, NLint param) {
+    __nlCheckState(__NL_STATE_INITIAL) ;
+    switch(pname) {
+    case NL_NB_VARIABLES: {
+        __nl_assert(param > 0) ;
+        __nlCurrentContext->nb_variables = (NLuint)param ;
+    } break ;
+    case NL_LEAST_SQUARES: {
+        __nlCurrentContext->least_squares = (NLboolean)param ;
+    } break ;
+    case NL_SYMMETRIC: {
+        __nlCurrentContext->symmetric = (NLboolean)param ;        
+    }
+    default: {
+        __nl_assert_not_reached ;
+    } break ;
+    }
+}
+
+void nlRowParameterf(NLenum pname, NLfloat param) {
+    __nlCheckState(__NL_STATE_MATRIX) ;
+    switch(pname) {
+    case NL_RIGHT_HAND_SIDE: {
+        __nlCurrentContext->right_hand_side = param ;
+    } break ;
+    case NL_ROW_SCALING: {
+        __nlCurrentContext->row_scaling = param ;
+    } break ;
+    }
+}
+
+void nlRowParameteri(NLenum pname, NLint param) {
+    __nlCheckState(__NL_STATE_MATRIX) ;
+    switch(pname) {
+    case NL_RIGHT_HAND_SIDE: {
+        __nlCurrentContext->right_hand_side = (NLfloat)param ;
+    } break ;
+    case NL_ROW_SCALING: {
+        __nlCurrentContext->row_scaling = (NLfloat)param ;
+    } break ;
+    }
+}
+
+void nlGetBooleanv(NLenum pname, NLboolean* params) {
+    switch(pname) {
+    case NL_LEAST_SQUARES: {
+        *params = __nlCurrentContext->least_squares ;
+    } break ;
+    case NL_SYMMETRIC: {
+        *params = __nlCurrentContext->symmetric ;
+    } break ;
+    default: {
+        __nl_assert_not_reached ;
+    } break ;
+    }
+}
+
+void nlGetFloatv(NLenum pname, NLfloat* params) {
+    switch(pname) {
+    case NL_NB_VARIABLES: {
+        *params = (NLfloat)(__nlCurrentContext->nb_variables) ;
+    } break ;
+    case NL_LEAST_SQUARES: {
+        *params = (NLfloat)(__nlCurrentContext->least_squares) ;
+    } break ;
+    case NL_SYMMETRIC: {
+        *params = (NLfloat)(__nlCurrentContext->symmetric) ;
+    } break ;
+    case NL_ERROR: {
+        *params = (NLfloat)(__nlCurrentContext->error) ;
+    } break ;
+    default: {
+        __nl_assert_not_reached ;
+    } break ;
+    }
+}
+
+void nlGetIntergerv(NLenum pname, NLint* params) {
+    switch(pname) {
+    case NL_NB_VARIABLES: {
+        *params = (NLint)(__nlCurrentContext->nb_variables) ;
+    } break ;
+    case NL_LEAST_SQUARES: {
+        *params = (NLint)(__nlCurrentContext->least_squares) ;
+    } break ;
+    case NL_SYMMETRIC: {
+        *params = (NLint)(__nlCurrentContext->symmetric) ;
+    } break ;
+    default: {
+        __nl_assert_not_reached ;
+    } break ;
+    }
+}
+
+/************************************************************************************/
+/* Enable / Disable */
+
+void nlEnable(NLenum pname) {
+    switch(pname) {
+    case NL_NORMALIZE_ROWS: {
+        __nl_assert(__nlCurrentContext->state != __NL_STATE_ROW) ;
+        __nlCurrentContext->normalize_rows = NL_TRUE ;
+    } break ;
+    default: {
+        __nl_assert_not_reached ;
+    }
+    }
+}
+
+void nlDisable(NLenum pname) {
+    switch(pname) {
+    case NL_NORMALIZE_ROWS: {
+        __nl_assert(__nlCurrentContext->state != __NL_STATE_ROW) ;
+        __nlCurrentContext->normalize_rows = NL_FALSE ;
+    } break ;
+    default: {
+        __nl_assert_not_reached ;
+    }
+    }
+}
+
+NLboolean nlIsEnabled(NLenum pname) {
+    switch(pname) {
+    case NL_NORMALIZE_ROWS: {
+        return __nlCurrentContext->normalize_rows ;
+    } break ;
+    default: {
+        __nl_assert_not_reached ;
+    }
+    }
+    return NL_FALSE ;
+}
+
+/************************************************************************************/
+/* Get/Set Lock/Unlock variables */
+
+void nlSetVariable(NLuint index, NLfloat value) {
+    __nlCheckState(__NL_STATE_SYSTEM) ;
+    __nl_parano_range_assert(index, 0, __nlCurrentContext->nb_variables - 1) ;
+    __nlCurrentContext->variable[index].value = value ;    
+}
+
+NLfloat nlGetVariable(NLuint index) {
+    __nl_assert(__nlCurrentContext->state != __NL_STATE_INITIAL) ;
+    __nl_parano_range_assert(index, 0, __nlCurrentContext->nb_variables - 1) ;
+    return __nlCurrentContext->variable[index].value ;
+}
+
+void nlLockVariable(NLuint index) {
+    __nlCheckState(__NL_STATE_SYSTEM) ;
+    __nl_parano_range_assert(index, 0, __nlCurrentContext->nb_variables - 1) ;
+    __nlCurrentContext->variable[index].locked = NL_TRUE ;
+}
+
+void nlUnlockVariable(NLuint index) {
+    __nlCheckState(__NL_STATE_SYSTEM) ;
+    __nl_parano_range_assert(index, 0, __nlCurrentContext->nb_variables - 1) ;
+    __nlCurrentContext->variable[index].locked = NL_FALSE ;
+}
+
+NLboolean nlVariableIsLocked(NLuint index) {
+    __nl_assert(__nlCurrentContext->state != __NL_STATE_INITIAL) ;
+    __nl_parano_range_assert(index, 0, __nlCurrentContext->nb_variables - 1) ;
+    return __nlCurrentContext->variable[index].locked  ;
+}
+
+/************************************************************************************/
+/* System construction */
+
+void __nlVariablesToVector() {
+    NLuint i ;
+    __nl_assert(__nlCurrentContext->alloc_x) ;
+    __nl_assert(__nlCurrentContext->alloc_variable) ;
+    for(i=0; i<__nlCurrentContext->nb_variables; i++) {
+        __NLVariable* v = &(__nlCurrentContext->variable[i]) ;
+        if(!v->locked) {
+            __nl_assert(v->index < __nlCurrentContext->n) ;
+            __nlCurrentContext->x[v->index] = v->value ;
+        }
+    }
+}
+
+void __nlVectorToVariables() {
+    NLuint i ;
+    __nl_assert(__nlCurrentContext->alloc_x) ;
+    __nl_assert(__nlCurrentContext->alloc_variable) ;
+    for(i=0; i<__nlCurrentContext->nb_variables; i++) {
+        __NLVariable* v = &(__nlCurrentContext->variable[i]) ;
+        if(!v->locked) {
+            __nl_assert(v->index < __nlCurrentContext->n) ;
+            v->value = __nlCurrentContext->x[v->index] ;
+        }
+    }
+}
+
+
+void __nlBeginSystem() {
+    __nlTransition(__NL_STATE_INITIAL, __NL_STATE_SYSTEM) ;
+    __nl_assert(__nlCurrentContext->nb_variables > 0) ;
+    __nlCurrentContext->variable = __NL_NEW_ARRAY(
+        __NLVariable, __nlCurrentContext->nb_variables
+    ) ;
+    __nlCurrentContext->alloc_variable = NL_TRUE ;
+}
+
+void __nlEndSystem() {
+    __nlTransition(__NL_STATE_MATRIX_CONSTRUCTED, __NL_STATE_SYSTEM_CONSTRUCTED) ;    
+}
+
+void __nlBeginMatrix() {
+    NLuint i ;
+    NLuint n = 0 ;
+    NLenum storage = __NL_ROWS ;
+
+    __nlTransition(__NL_STATE_SYSTEM, __NL_STATE_MATRIX) ;
+
+    for(i=0; i<__nlCurrentContext->nb_variables; i++) {
+        if(!__nlCurrentContext->variable[i].locked) {
+            __nlCurrentContext->variable[i].index = n ;
+            n++ ;
+        } else {
+            __nlCurrentContext->variable[i].index = ~0 ;
+        }
+    }
+
+    __nlCurrentContext->n = n ;
+
+    /* a least squares problem results in a symmetric matrix */
+    if(__nlCurrentContext->least_squares) {
+        __nlCurrentContext->symmetric = NL_TRUE ;
+    }
+
+    if(__nlCurrentContext->symmetric) {
+        storage = (storage | __NL_SYMMETRIC) ;
+    }
+
+    /* SuperLU storage does not support symmetric storage */
+    storage = (storage & ~__NL_SYMMETRIC) ;
+
+    __nlSparseMatrixConstruct(&__nlCurrentContext->M, n, n, storage) ;
+    __nlCurrentContext->alloc_M = NL_TRUE ;
+
+    __nlCurrentContext->x = __NL_NEW_ARRAY(NLfloat, n) ;
+    __nlCurrentContext->alloc_x = NL_TRUE ;
+    
+    __nlCurrentContext->b = __NL_NEW_ARRAY(NLfloat, n) ;
+    __nlCurrentContext->alloc_b = NL_TRUE ;
+
+    __nlVariablesToVector() ;
+
+    __nlRowColumnConstruct(&__nlCurrentContext->af) ;
+    __nlCurrentContext->alloc_af = NL_TRUE ;
+    __nlRowColumnConstruct(&__nlCurrentContext->al) ;
+    __nlCurrentContext->alloc_al = NL_TRUE ;
+    __nlRowColumnConstruct(&__nlCurrentContext->xl) ;
+    __nlCurrentContext->alloc_xl = NL_TRUE ;
+
+    __nlCurrentContext->current_row = 0 ;
+}
+
+void __nlEndMatrix() {
+    __nlTransition(__NL_STATE_MATRIX, __NL_STATE_MATRIX_CONSTRUCTED) ;    
+    
+    __nlRowColumnDestroy(&__nlCurrentContext->af) ;
+    __nlCurrentContext->alloc_af = NL_FALSE ;
+    __nlRowColumnDestroy(&__nlCurrentContext->al) ;
+    __nlCurrentContext->alloc_al = NL_FALSE ;
+    __nlRowColumnDestroy(&__nlCurrentContext->xl) ;
+    __nlCurrentContext->alloc_al = NL_FALSE ;
+    
+    if(!__nlCurrentContext->least_squares) {
+        __nl_assert(
+            __nlCurrentContext->current_row == 
+            __nlCurrentContext->n
+        ) ;
+    }
+}
+
+void __nlBeginRow() {
+    __nlTransition(__NL_STATE_MATRIX, __NL_STATE_ROW) ;
+    __nlRowColumnZero(&__nlCurrentContext->af) ;
+    __nlRowColumnZero(&__nlCurrentContext->al) ;
+    __nlRowColumnZero(&__nlCurrentContext->xl) ;
+}
+
+void __nlScaleRow(NLfloat s) {
+    __NLRowColumn*    af = &__nlCurrentContext->af ;
+    __NLRowColumn*    al = &__nlCurrentContext->al ;
+    NLuint nf            = af->size ;
+    NLuint nl            = al->size ;
+    NLuint i ;
+    for(i=0; i<nf; i++) {
+        af->coeff[i].value *= s ;
+    }
+    for(i=0; i<nl; i++) {
+        al->coeff[i].value *= s ;
+    }
+    __nlCurrentContext->right_hand_side *= s ;
+}
+
+void __nlNormalizeRow(NLfloat weight) {
+    __NLRowColumn*    af = &__nlCurrentContext->af ;
+    __NLRowColumn*    al = &__nlCurrentContext->al ;
+    NLuint nf            = af->size ;
+    NLuint nl            = al->size ;
+    NLuint i ;
+    NLfloat norm = 0.0 ;
+    for(i=0; i<nf; i++) {
+        norm += af->coeff[i].value * af->coeff[i].value ;
+    }
+    for(i=0; i<nl; i++) {
+        norm += al->coeff[i].value * al->coeff[i].value ;
+    }
+    norm = sqrt(norm) ;
+    __nlScaleRow(weight / norm) ;
+}
+
+void __nlEndRow() {
+    __NLRowColumn*    af = &__nlCurrentContext->af ;
+    __NLRowColumn*    al = &__nlCurrentContext->al ;
+    __NLRowColumn*    xl = &__nlCurrentContext->xl ;
+    __NLSparseMatrix* M  = &__nlCurrentContext->M  ;
+    NLfloat* b        = __nlCurrentContext->b ;
+    NLuint nf          = af->size ;
+    NLuint nl          = al->size ;
+    NLuint current_row = __nlCurrentContext->current_row ;
+    NLuint i ;
+    NLuint j ;
+    NLfloat S ;
+    __nlTransition(__NL_STATE_ROW, __NL_STATE_MATRIX) ;
+
+    if(__nlCurrentContext->normalize_rows) {
+        __nlNormalizeRow(__nlCurrentContext->row_scaling) ;
+    } else {
+        __nlScaleRow(__nlCurrentContext->row_scaling) ;
+    }
+
+    if(__nlCurrentContext->least_squares) {
+        for(i=0; i<nf; i++) {
+            for(j=0; j<nf; j++) {
+                __nlSparseMatrixAdd(
+                    M, af->coeff[i].index, af->coeff[j].index,
+                    af->coeff[i].value * af->coeff[j].value
+                ) ;
+            }
+        }
+        S = -__nlCurrentContext->right_hand_side ;
+        for(j=0; j<nl; j++) {
+            S += al->coeff[j].value * xl->coeff[j].value ;
+        }
+        for(i=0; i<nf; i++) {
+            b[ af->coeff[i].index ] -= af->coeff[i].value * S ;
+        }
+    } else {
+        for(i=0; i<nf; i++) {
+            __nlSparseMatrixAdd(
+                M, current_row, af->coeff[i].index, af->coeff[i].value
+            ) ;
+        }
+        b[current_row] = -__nlCurrentContext->right_hand_side ;
+        for(i=0; i<nl; i++) {
+            b[current_row] -= al->coeff[i].value * xl->coeff[i].value ;
+        }
+    }
+    __nlCurrentContext->current_row++ ;
+    __nlCurrentContext->right_hand_side = 0.0 ;    
+    __nlCurrentContext->row_scaling     = 1.0 ;
+}
+
+void nlCoefficient(NLuint index, NLfloat value) {
+    __NLVariable* v;
+	unsigned int zero= 0;
+    __nlCheckState(__NL_STATE_ROW) ;
+    __nl_range_assert(index, zero, __nlCurrentContext->nb_variables - 1) ;
+    v = &(__nlCurrentContext->variable[index]) ;
+    if(v->locked) {
+        __nlRowColumnAppend(&(__nlCurrentContext->al), 0, value) ;
+        __nlRowColumnAppend(&(__nlCurrentContext->xl), 0, v->value) ;
+    } else {
+        __nlRowColumnAppend(&(__nlCurrentContext->af), v->index, value) ;
+    }
+}
+
+void nlBegin(NLenum prim) {
+    switch(prim) {
+    case NL_SYSTEM: {
+        __nlBeginSystem() ;
+    } break ;
+    case NL_MATRIX: {
+        __nlBeginMatrix() ;
+    } break ;
+    case NL_ROW: {
+        __nlBeginRow() ;
+    } break ;
+    default: {
+        __nl_assert_not_reached ;
+    }
+    }
+}
+
+void nlEnd(NLenum prim) {
+    switch(prim) {
+    case NL_SYSTEM: {
+        __nlEndSystem() ;
+    } break ;
+    case NL_MATRIX: {
+        __nlEndMatrix() ;
+    } break ;
+    case NL_ROW: {
+        __nlEndRow() ;
+    } break ;
+    default: {
+        __nl_assert_not_reached ;
+    }
+    }
+}
+
+/************************************************************************/
+/* SuperLU wrapper */
+
+/* Note: SuperLU is difficult to call, but it is worth it.    */
+/* Here is a driver inspired by A. Sheffer's "cow flattener". */
+static NLboolean __nlSolve_SUPERLU( NLboolean do_perm) {
+
+    /* OpenNL Context */
+    __NLSparseMatrix* M  = &(__nlCurrentContext->M) ;
+    NLfloat* b          = __nlCurrentContext->b ;
+    NLfloat* x          = __nlCurrentContext->x ;
+
+    /* Compressed Row Storage matrix representation */
+    NLuint    n      = __nlCurrentContext->n ;
+    NLuint    nnz    = __nlSparseMatrixNNZ(M) ; /* Number of Non-Zero coeffs */
+    NLint*    xa     = __NL_NEW_ARRAY(NLint, n+1) ;
+    NLfloat* rhs    = __NL_NEW_ARRAY(NLfloat, n) ;
+    NLfloat* a      = __NL_NEW_ARRAY(NLfloat, nnz) ;
+    NLint*    asub   = __NL_NEW_ARRAY(NLint, nnz) ;
+
+    /* Permutation vector */
+    NLint*    perm_r  = __NL_NEW_ARRAY(NLint, n) ;
+    NLint*    perm    = __NL_NEW_ARRAY(NLint, n) ;
+
+    /* SuperLU variables */
+    SuperMatrix A, B ; /* System       */
+    SuperMatrix L, U ; /* Inverse of A */
+    NLint info ;       /* status code  */
+    DNformat *vals = NULL ; /* access to result */
+    float *rvals  = NULL ; /* access to result */
+
+    /* SuperLU options and stats */
+    superlu_options_t options ;
+    SuperLUStat_t     stat ;
+
+
+    /* Temporary variables */
+    __NLRowColumn* Ri = NULL ;
+    NLuint         i,jj,count ;
+    
+    __nl_assert(!(M->storage & __NL_SYMMETRIC)) ;
+    __nl_assert(M->storage & __NL_ROWS) ;
+    __nl_assert(M->m == M->n) ;
+    
+    
+    /*
+     * Step 1: convert matrix M into SuperLU compressed column 
+     *   representation.
+     * -------------------------------------------------------
+     */
+
+    count = 0 ;
+    for(i=0; i<n; i++) {
+        Ri = &(M->row[i]) ;
+        xa[i] = count ;
+        for(jj=0; jj<Ri->size; jj++) {
+            a[count]    = Ri->coeff[jj].value ;
+            asub[count] = Ri->coeff[jj].index ;
+            count++ ;
+        }
+    }
+    xa[n] = nnz ;
+
+    /* Save memory for SuperLU */
+    __nlSparseMatrixClear(M) ;
+
+
+    /*
+     * Rem: symmetric storage does not seem to work with
+     * SuperLU ... (->deactivated in main SLS::Solver driver)
+     */
+    sCreate_CompCol_Matrix(
+        &A, n, n, nnz, a, asub, xa, 
+        SLU_NR,              /* Row_wise, no supernode */
+        SLU_S,               /* floats                */ 
+        SLU_GE               /* general storage        */
+    );
+
+    /* Step 2: create vector */
+    sCreate_Dense_Matrix(
+        &B, n, 1, b, n, 
+        SLU_DN, /* Fortran-type column-wise storage */
+        SLU_S,  /* floats                          */
+        SLU_GE  /* general                          */
+    );
+            
+
+    /* Step 3: get permutation matrix 
+     * ------------------------------
+     * com_perm: 0 -> no re-ordering
+     *           1 -> re-ordering for A^t.A
+     *           2 -> re-ordering for A^t+A
+     *           3 -> approximate minimum degree ordering
+     */
+    get_perm_c(do_perm ? 3 : 0, &A, perm) ;
+
+    /* Step 4: call SuperLU main routine
+     * ---------------------------------
+     */
+
+    set_default_options(&options) ;
+    options.ColPerm = MY_PERMC ;
+    StatInit(&stat) ;
+
+    sgssv(&options, &A, perm, perm_r, &L, &U, &B, &stat, &info);
+
+    /* Step 5: get the solution
+     * ------------------------
+     * Fortran-type column-wise storage
+     */
+    vals = (DNformat*)B.Store;
+    rvals = (float*)(vals->nzval);
+    if(info == 0) {
+        for(i = 0; i <  n; i++){
+            x[i] = rvals[i];
+        }
+    }
+
+    /* Step 6: cleanup
+     * ---------------
+     */
+
+    /*
+     *  For these two ones, only the "store" structure
+     * needs to be deallocated (the arrays have been allocated
+     * by us).
+     */
+    Destroy_SuperMatrix_Store(&A) ;
+    Destroy_SuperMatrix_Store(&B) ;
+
+    
+    /*
+     *   These ones need to be fully deallocated (they have been
+     * allocated by SuperLU).
+     */
+    Destroy_SuperNode_Matrix(&L);
+    Destroy_CompCol_Matrix(&U);
+
+    __NL_DELETE_ARRAY(xa) ;
+    __NL_DELETE_ARRAY(rhs) ;
+    __NL_DELETE_ARRAY(a) ;
+    __NL_DELETE_ARRAY(asub) ;
+    __NL_DELETE_ARRAY(perm_r) ;
+    __NL_DELETE_ARRAY(perm) ;
+
+    return (info == 0) ;
+}
+
+
+/************************************************************************/
+/* nlSolve() driver routine */
+
+NLboolean nlSolve() {
+    NLboolean result = NL_TRUE ;
+
+    __nlCheckState(__NL_STATE_SYSTEM_CONSTRUCTED) ;
+    result = __nlSolve_SUPERLU(NL_TRUE) ;
+
+    __nlVectorToVariables() ;
+    __nlTransition(__NL_STATE_SYSTEM_CONSTRUCTED, __NL_STATE_SOLVED) ;
+
+    return result ;
+}
+
diff --git a/intern/opennl/superlu/Cnames.h b/intern/opennl/superlu/Cnames.h
new file mode 100644
index 00000000000..35ff7b0b665
--- /dev/null
+++ b/intern/opennl/superlu/Cnames.h
@@ -0,0 +1,281 @@
+/*
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 1, 1997
+ *
+ */
+#ifndef __SUPERLU_CNAMES /* allow multiple inclusions */
+#define __SUPERLU_CNAMES
+
+/* We want this flag, safer than putting in build system */
+#define Add_
+
+/*
+ * These macros define how C routines will be called.  ADD_ assumes that
+ * they will be called by fortran, which expects C routines to have an
+ * underscore postfixed to the name (Suns, and the Intel expect this).
+ * NOCHANGE indicates that fortran will be calling, and that it expects
+ * the name called by fortran to be identical to that compiled by the C
+ * (RS6K's do this).  UPCASE says it expects C routines called by fortran
+ * to be in all upcase (CRAY wants this). 
+ */
+
+#define ADD_       0
+#define ADD__      1
+#define NOCHANGE   2
+#define UPCASE     3
+#define C_CALL     4
+
+#ifdef UpCase
+#define F77_CALL_C UPCASE
+#endif
+
+#ifdef NoChange
+#define F77_CALL_C NOCHANGE
+#endif
+
+#ifdef Add_
+#define F77_CALL_C ADD_
+#endif
+
+#ifdef Add__
+#define F77_CALL_C ADD__
+#endif
+
+/* Default */
+#ifndef F77_CALL_C
+#define F77_CALL_C ADD_
+#endif
+
+
+#if (F77_CALL_C == ADD_)
+/*
+ * These defines set up the naming scheme required to have a fortran 77
+ * routine call a C routine
+ * No redefinition necessary to have following Fortran to C interface:
+ *           FORTRAN CALL               C DECLARATION
+ *           call dgemm(...)           void dgemm_(...)
+ *
+ * This is the default.
+ */
+
+#endif
+
+#if (F77_CALL_C == ADD__)
+/*
+ * These defines set up the naming scheme required to have a fortran 77
+ * routine call a C routine 
+ * for following Fortran to C interface:
+ *           FORTRAN CALL               C DECLARATION
+ *           call dgemm(...)           void dgemm__(...)
+ */
+#define sasum_    sasum__
+#define isamax_   isamax__
+#define scopy_    scopy__
+#define sscal_    sscal__
+#define sger_     sger__
+#define snrm2_    snrm2__
+#define ssymv_    ssymv__
+#define sdot_     sdot__
+#define saxpy_    saxpy__
+#define ssyr2_    ssyr2__
+#define srot_     srot__
+#define sgemv_    sgemv__
+#define strsv_    strsv__
+#define sgemm_    sgemm__
+#define strsm_    strsm__
+
+#define dasum_    dasum__
+#define idamax_   idamax__
+#define dcopy_    dcopy__
+#define dscal_    dscal__
+#define dger_     dger__
+#define dnrm2_    dnrm2__
+#define dsymv_    dsymv__
+#define ddot_     ddot__
+#define daxpy_    daxpy__
+#define dsyr2_    dsyr2__
+#define drot_     drot__
+#define dgemv_    dgemv__
+#define dtrsv_    dtrsv__
+#define dgemm_    dgemm__
+#define dtrsm_    dtrsm__
+
+#define scasum_   scasum__
+#define icamax_   icamax__
+#define ccopy_    ccopy__
+#define cscal_    cscal__
+#define scnrm2_   scnrm2__
+#define caxpy_    caxpy__
+#define cgemv_    cgemv__
+#define ctrsv_    ctrsv__
+#define cgemm_    cgemm__
+#define ctrsm_    ctrsm__
+#define cgerc_    cgerc__
+#define chemv_    chemv__
+#define cher2_    cher2__
+
+#define dzasum_   dzasum__
+#define izamax_   izamax__
+#define zcopy_    zcopy__
+#define zscal_    zscal__
+#define dznrm2_   dznrm2__
+#define zaxpy_    zaxpy__
+#define zgemv_    zgemv__
+#define ztrsv_    ztrsv__
+#define zgemm_    zgemm__
+#define ztrsm_    ztrsm__
+#define zgerc_    zgerc__
+#define zhemv_    zhemv__
+#define zher2_    zher2__
+
+#define c_bridge_dgssv_ c_bridge_dgssv__
+#define c_fortran_dgssv_ c_fortran_dgssv__
+#endif
+
+#if (F77_CALL_C == UPCASE)
+/*
+ * These defines set up the naming scheme required to have a fortran 77
+ * routine call a C routine 
+ * following Fortran to C interface:
+ *           FORTRAN CALL               C DECLARATION
+ *           call dgemm(...)           void DGEMM(...)
+ */
+#define sasum_    SASUM
+#define isamax_   ISAMAX
+#define scopy_    SCOPY
+#define sscal_    SSCAL
+#define sger_     SGER
+#define snrm2_    SNRM2
+#define ssymv_    SSYMV
+#define sdot_     SDOT
+#define saxpy_    SAXPY
+#define ssyr2_    SSYR2
+#define srot_     SROT
+#define sgemv_    SGEMV
+#define strsv_    STRSV
+#define sgemm_    SGEMM
+#define strsm_    STRSM
+
+#define dasum_    SASUM
+#define idamax_   ISAMAX
+#define dcopy_    SCOPY
+#define dscal_    SSCAL
+#define dger_     SGER
+#define dnrm2_    SNRM2
+#define dsymv_    SSYMV
+#define ddot_     SDOT
+#define daxpy_    SAXPY
+#define dsyr2_    SSYR2
+#define drot_     SROT
+#define dgemv_    SGEMV
+#define dtrsv_    STRSV
+#define dgemm_    SGEMM
+#define dtrsm_    STRSM
+
+#define scasum_   SCASUM
+#define icamax_   ICAMAX
+#define ccopy_    CCOPY
+#define cscal_    CSCAL
+#define scnrm2_   SCNRM2
+#define caxpy_    CAXPY
+#define cgemv_    CGEMV
+#define ctrsv_    CTRSV
+#define cgemm_    CGEMM
+#define ctrsm_    CTRSM
+#define cgerc_    CGERC
+#define chemv_    CHEMV
+#define cher2_    CHER2
+
+#define dzasum_   SCASUM
+#define izamax_   ICAMAX
+#define zcopy_    CCOPY
+#define zscal_    CSCAL
+#define dznrm2_   SCNRM2
+#define zaxpy_    CAXPY
+#define zgemv_    CGEMV
+#define ztrsv_    CTRSV
+#define zgemm_    CGEMM
+#define ztrsm_    CTRSM
+#define zgerc_    CGERC
+#define zhemv_    CHEMV
+#define zher2_    CHER2
+
+#define c_bridge_dgssv_ C_BRIDGE_DGSSV
+#define c_fortran_dgssv_ C_FORTRAN_DGSSV
+#endif
+
+#if (F77_CALL_C == NOCHANGE)
+/*
+ * These defines set up the naming scheme required to have a fortran 77
+ * routine call a C routine 
+ * for following Fortran to C interface:
+ *           FORTRAN CALL               C DECLARATION
+ *           call dgemm(...)           void dgemm(...)
+ */
+#define sasum_    sasum
+#define isamax_   isamax
+#define scopy_    scopy
+#define sscal_    sscal
+#define sger_     sger
+#define snrm2_    snrm2
+#define ssymv_    ssymv
+#define sdot_     sdot
+#define saxpy_    saxpy
+#define ssyr2_    ssyr2
+#define srot_     srot
+#define sgemv_    sgemv
+#define strsv_    strsv
+#define sgemm_    sgemm
+#define strsm_    strsm
+
+#define dasum_    dasum
+#define idamax_   idamax
+#define dcopy_    dcopy
+#define dscal_    dscal
+#define dger_     dger
+#define dnrm2_    dnrm2
+#define dsymv_    dsymv
+#define ddot_     ddot
+#define daxpy_    daxpy
+#define dsyr2_    dsyr2
+#define drot_     drot
+#define dgemv_    dgemv
+#define dtrsv_    dtrsv
+#define dgemm_    dgemm
+#define dtrsm_    dtrsm
+
+#define scasum_   scasum
+#define icamax_   icamax
+#define ccopy_    ccopy
+#define cscal_    cscal
+#define scnrm2_   scnrm2
+#define caxpy_    caxpy
+#define cgemv_    cgemv
+#define ctrsv_    ctrsv
+#define cgemm_    cgemm
+#define ctrsm_    ctrsm
+#define cgerc_    cgerc
+#define chemv_    chemv
+#define cher2_    cher2
+
+#define dzasum_   dzasum
+#define izamax_   izamax
+#define zcopy_    zcopy
+#define zscal_    zscal
+#define dznrm2_   dznrm2
+#define zaxpy_    zaxpy
+#define zgemv_    zgemv
+#define ztrsv_    ztrsv
+#define zgemm_    zgemm
+#define ztrsm_    ztrsm
+#define zgerc_    zgerc
+#define zhemv_    zhemv
+#define zher2_    zher2
+
+#define c_bridge_dgssv_ c_bridge_dgssv
+#define c_fortran_dgssv_ c_fortran_dgssv
+#endif
+
+#endif /* __SUPERLU_CNAMES */
diff --git a/intern/opennl/superlu/Makefile b/intern/opennl/superlu/Makefile
new file mode 100644
index 00000000000..942ceebc79c
--- /dev/null
+++ b/intern/opennl/superlu/Makefile
@@ -0,0 +1,40 @@
+#
+# $Id$
+#
+# ***** BEGIN GPL/BL DUAL LICENSE BLOCK *****
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version. The Blender
+# Foundation also sells licenses for use in proprietary software under
+# the Blender License.  See http://www.blender.org/BL/ for information
+# about this.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#
+# The Original Code is Copyright (C) 2001-2002 by NaN Holding BV.
+# All rights reserved.
+#
+# The Original Code is: all of this file.
+#
+# Contributor(s): none yet.
+#
+# ***** END GPL/BL DUAL LICENSE BLOCK *****
+# opennl intern Makefile
+#
+
+LIBNAME = superlu
+DIR = $(OCGDIR)/intern/$(LIBNAME)
+
+include nan_compile.mk
+
+CCFLAGS += $(NAN_LEVEL_2_CPP_WARNINGS)
+
diff --git a/intern/opennl/superlu/colamd.c b/intern/opennl/superlu/colamd.c
new file mode 100644
index 00000000000..b60718f9938
--- /dev/null
+++ b/intern/opennl/superlu/colamd.c
@@ -0,0 +1,2583 @@
+/* ========================================================================== */
+/* === colamd - a sparse matrix column ordering algorithm =================== */
+/* ========================================================================== */
+
+/*
+    colamd:  An approximate minimum degree column ordering algorithm.
+
+    Purpose:
+
+	Colamd computes a permutation Q such that the Cholesky factorization of
+	(AQ)'(AQ) has less fill-in and requires fewer floating point operations
+	than A'A.  This also provides a good ordering for sparse partial
+	pivoting methods, P(AQ) = LU, where Q is computed prior to numerical
+	factorization, and P is computed during numerical factorization via
+	conventional partial pivoting with row interchanges.  Colamd is the
+	column ordering method used in SuperLU, part of the ScaLAPACK library.
+	It is also available as user-contributed software for Matlab 5.2,
+	available from MathWorks, Inc. (http://www.mathworks.com).  This
+	routine can be used in place of COLMMD in Matlab.  By default, the \
+	and / operators in Matlab perform a column ordering (using COLMMD)
+	prior to LU factorization using sparse partial pivoting, in the
+	built-in Matlab LU(A) routine.
+
+    Authors:
+
+	The authors of the code itself are Stefan I. Larimore and Timothy A.
+	Davis (davis@cise.ufl.edu), University of Florida.  The algorithm was
+	developed in collaboration with John Gilbert, Xerox PARC, and Esmond
+	Ng, Oak Ridge National Laboratory.
+
+    Date:
+
+	August 3, 1998.  Version 1.0.
+
+    Acknowledgements:
+
+	This work was supported by the National Science Foundation, under
+	grants DMS-9504974 and DMS-9803599.
+
+    Notice:
+
+	Copyright (c) 1998 by the University of Florida.  All Rights Reserved.
+
+	THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+	EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+
+	Permission is hereby granted to use or copy this program for any
+	purpose, provided the above notices are retained on all copies.
+	User documentation of any code that uses this code must cite the
+	Authors, the Copyright, and "Used by permission."  If this code is
+	accessible from within Matlab, then typing "help colamd" or "colamd"
+	(with no arguments) must cite the Authors.  Permission to modify the
+	code and to distribute modified code is granted, provided the above
+	notices are retained, and a notice that the code was modified is
+	included with the above copyright notice.  You must also retain the
+	Availability information below, of the original version.
+
+	This software is provided free of charge.
+
+    Availability:
+
+	This file is located at
+
+		http://www.cise.ufl.edu/~davis/colamd/colamd.c
+
+	The colamd.h file is required, located in the same directory.
+	The colamdmex.c file provides a Matlab interface for colamd.
+	The symamdmex.c file provides a Matlab interface for symamd, which is
+	a symmetric ordering based on this code, colamd.c.  All codes are
+	purely ANSI C compliant (they use no Unix-specific routines, include
+	files, etc.).
+*/
+
+/* ========================================================================== */
+/* === Description of user-callable routines ================================ */
+/* ========================================================================== */
+
+/*
+    Each user-callable routine (declared as PUBLIC) is briefly described below.
+    Refer to the comments preceding each routine for more details.
+
+    ----------------------------------------------------------------------------
+    colamd_recommended:
+    ----------------------------------------------------------------------------
+
+	Usage:
+
+	    Alen = colamd_recommended (nnz, n_row, n_col) ;
+
+	Purpose:
+
+	    Returns recommended value of Alen for use by colamd.  Returns -1
+	    if any input argument is negative.
+
+	Arguments:
+
+	    int nnz ;		Number of nonzeros in the matrix A.  This must
+				be the same value as p [n_col] in the call to
+				colamd - otherwise you will get a wrong value
+				of the recommended memory to use.
+	    int n_row ;		Number of rows in the matrix A.
+	    int n_col ;		Number of columns in the matrix A.
+
+    ----------------------------------------------------------------------------
+    colamd_set_defaults:
+    ----------------------------------------------------------------------------
+
+	Usage:
+
+	    colamd_set_defaults (knobs) ;
+
+	Purpose:
+
+	    Sets the default parameters.
+
+	Arguments:
+
+	    double knobs [COLAMD_KNOBS] ;	Output only.
+
+		Rows with more than (knobs [COLAMD_DENSE_ROW] * n_col) entries
+		are removed prior to ordering.  Columns with more than
+		(knobs [COLAMD_DENSE_COL] * n_row) entries are removed
+		prior to ordering, and placed last in the output column
+		ordering.  Default values of these two knobs are both 0.5.
+		Currently, only knobs [0] and knobs [1] are used, but future
+		versions may use more knobs.  If so, they will be properly set
+		to their defaults by the future version of colamd_set_defaults,
+		so that the code that calls colamd will not need to change,
+		assuming that you either use colamd_set_defaults, or pass a
+		(double *) NULL pointer as the knobs array to colamd.
+
+    ----------------------------------------------------------------------------
+    colamd:
+    ----------------------------------------------------------------------------
+
+	Usage:
+
+	    colamd (n_row, n_col, Alen, A, p, knobs) ;
+
+	Purpose:
+
+	    Computes a column ordering (Q) of A such that P(AQ)=LU or
+	    (AQ)'AQ=LL' have less fill-in and require fewer floating point
+	    operations than factorizing the unpermuted matrix A or A'A,
+	    respectively.
+
+	Arguments:
+
+	    int n_row ;
+
+		Number of rows in the matrix A.
+		Restriction:  n_row >= 0.
+		Colamd returns FALSE if n_row is negative.
+
+	    int n_col ;
+
+		Number of columns in the matrix A.
+		Restriction:  n_col >= 0.
+		Colamd returns FALSE if n_col is negative.
+
+	    int Alen ;
+
+		Restriction (see note):
+		Alen >= 2*nnz + 6*(n_col+1) + 4*(n_row+1) + n_col + COLAMD_STATS
+		Colamd returns FALSE if these conditions are not met.
+
+		Note:  this restriction makes an modest assumption regarding
+		the size of the two typedef'd structures, below.  We do,
+		however, guarantee that
+		Alen >= colamd_recommended (nnz, n_row, n_col)
+		will be sufficient.
+
+	    int A [Alen] ;	Input argument, stats on output.
+
+		A is an integer array of size Alen.  Alen must be at least as
+		large as the bare minimum value given above, but this is very
+		low, and can result in excessive run time.  For best
+		performance, we recommend that Alen be greater than or equal to
+		colamd_recommended (nnz, n_row, n_col), which adds
+		nnz/5 to the bare minimum value given above.
+
+		On input, the row indices of the entries in column c of the
+		matrix are held in A [(p [c]) ... (p [c+1]-1)].  The row indices
+		in a given column c need not be in ascending order, and
+		duplicate row indices may be be present.  However, colamd will
+		work a little faster if both of these conditions are met
+		(Colamd puts the matrix into this format, if it finds that the
+		the conditions are not met).
+
+		The matrix is 0-based.  That is, rows are in the range 0 to
+		n_row-1, and columns are in the range 0 to n_col-1.  Colamd
+		returns FALSE if any row index is out of range.
+
+		The contents of A are modified during ordering, and are thus
+		undefined on output with the exception of a few statistics
+		about the ordering (A [0..COLAMD_STATS-1]):
+		A [0]:  number of dense or empty rows ignored.
+		A [1]:  number of dense or empty columns ignored (and ordered
+			last in the output permutation p)
+		A [2]:  number of garbage collections performed.
+		A [3]:  0, if all row indices in each column were in sorted
+			  order, and no duplicates were present.
+			1, otherwise (in which case colamd had to do more work)
+		Note that a row can become "empty" if it contains only
+		"dense" and/or "empty" columns, and similarly a column can
+		become "empty" if it only contains "dense" and/or "empty" rows.
+		Future versions may return more statistics in A, but the usage
+		of these 4 entries in A will remain unchanged.
+
+	    int p [n_col+1] ;	Both input and output argument.
+
+		p is an integer array of size n_col+1.  On input, it holds the
+		"pointers" for the column form of the matrix A.  Column c of
+		the matrix A is held in A [(p [c]) ... (p [c+1]-1)].  The first
+		entry, p [0], must be zero, and p [c] <= p [c+1] must hold
+		for all c in the range 0 to n_col-1.  The value p [n_col] is
+		thus the total number of entries in the pattern of the matrix A.
+		Colamd returns FALSE if these conditions are not met.
+
+		On output, if colamd returns TRUE, the array p holds the column
+		permutation (Q, for P(AQ)=LU or (AQ)'(AQ)=LL'), where p [0] is
+		the first column index in the new ordering, and p [n_col-1] is
+		the last.  That is, p [k] = j means that column j of A is the
+		kth pivot column, in AQ, where k is in the range 0 to n_col-1
+		(p [0] = j means that column j of A is the first column in AQ).
+
+		If colamd returns FALSE, then no permutation is returned, and
+		p is undefined on output.
+
+	    double knobs [COLAMD_KNOBS] ;	Input only.
+
+		See colamd_set_defaults for a description.  If the knobs array
+		is not present (that is, if a (double *) NULL pointer is passed
+		in its place), then the default values of the parameters are
+		used instead.
+
+*/
+
+
+/* ========================================================================== */
+/* === Include files ======================================================== */
+/* ========================================================================== */
+
+/* limits.h:  the largest positive integer (INT_MAX) */
+#include <limits.h>
+
+/* colamd.h:  knob array size, stats output size, and global prototypes */
+#include "colamd.h"
+
+/* ========================================================================== */
+/* === Scaffolding code definitions  ======================================== */
+/* ========================================================================== */
+
+/* Ensure that debugging is turned off: */
+#ifndef NDEBUG
+#define NDEBUG
+#endif
+
+/* assert.h:  the assert macro (no debugging if NDEBUG is defined) */
+#include <assert.h>
+
+/*
+   Our "scaffolding code" philosophy:  In our opinion, well-written library
+   code should keep its "debugging" code, and just normally have it turned off
+   by the compiler so as not to interfere with performance.  This serves
+   several purposes:
+
+   (1) assertions act as comments to the reader, telling you what the code
+	expects at that point.  All assertions will always be true (unless
+	there really is a bug, of course).
+
+   (2) leaving in the scaffolding code assists anyone who would like to modify
+	the code, or understand the algorithm (by reading the debugging output,
+	one can get a glimpse into what the code is doing).
+
+   (3) (gasp!) for actually finding bugs.  This code has been heavily tested
+	and "should" be fully functional and bug-free ... but you never know...
+
+    To enable debugging, comment out the "#define NDEBUG" above.  The code will
+    become outrageously slow when debugging is enabled.  To control the level of
+    debugging output, set an environment variable D to 0 (little), 1 (some),
+    2, 3, or 4 (lots).
+*/
+
+/* ========================================================================== */
+/* === Row and Column structures ============================================ */
+/* ========================================================================== */
+
+typedef struct ColInfo_struct
+{
+    int start ;		/* index for A of first row in this column, or DEAD */
+			/* if column is dead */
+    int length ;	/* number of rows in this column */
+    union
+    {
+	int thickness ;	/* number of original columns represented by this */
+			/* col, if the column is alive */
+	int parent ;	/* parent in parent tree super-column structure, if */
+			/* the column is dead */
+    } shared1 ;
+    union
+    {
+	int score ;	/* the score used to maintain heap, if col is alive */
+	int order ;	/* pivot ordering of this column, if col is dead */
+    } shared2 ;
+    union
+    {
+	int headhash ;	/* head of a hash bucket, if col is at the head of */
+			/* a degree list */
+	int hash ;	/* hash value, if col is not in a degree list */
+	int prev ;	/* previous column in degree list, if col is in a */
+			/* degree list (but not at the head of a degree list) */
+    } shared3 ;
+    union
+    {
+	int degree_next ;	/* next column, if col is in a degree list */
+	int hash_next ;		/* next column, if col is in a hash list */
+    } shared4 ;
+
+} ColInfo ;
+
+typedef struct RowInfo_struct
+{
+    int start ;		/* index for A of first col in this row */
+    int length ;	/* number of principal columns in this row */
+    union
+    {
+	int degree ;	/* number of principal & non-principal columns in row */
+	int p ;		/* used as a row pointer in init_rows_cols () */
+    } shared1 ;
+    union
+    {
+	int mark ;	/* for computing set differences and marking dead rows*/
+	int first_column ;/* first column in row (used in garbage collection) */
+    } shared2 ;
+
+} RowInfo ;
+
+/* ========================================================================== */
+/* === Definitions ========================================================== */
+/* ========================================================================== */
+
+#define MAX(a,b) (((a) > (b)) ? (a) : (b))
+#define MIN(a,b) (((a) < (b)) ? (a) : (b))
+
+#define ONES_COMPLEMENT(r) (-(r)-1)
+
+#define TRUE	(1)
+#define FALSE	(0)
+#define EMPTY	(-1)
+
+/* Row and column status */
+#define ALIVE	(0)
+#define DEAD	(-1)
+
+/* Column status */
+#define DEAD_PRINCIPAL		(-1)
+#define DEAD_NON_PRINCIPAL	(-2)
+
+/* Macros for row and column status update and checking. */
+#define ROW_IS_DEAD(r)			ROW_IS_MARKED_DEAD (Row[r].shared2.mark)
+#define ROW_IS_MARKED_DEAD(row_mark)	(row_mark < ALIVE)
+#define ROW_IS_ALIVE(r)			(Row [r].shared2.mark >= ALIVE)
+#define COL_IS_DEAD(c)			(Col [c].start < ALIVE)
+#define COL_IS_ALIVE(c)			(Col [c].start >= ALIVE)
+#define COL_IS_DEAD_PRINCIPAL(c)	(Col [c].start == DEAD_PRINCIPAL)
+#define KILL_ROW(r)			{ Row [r].shared2.mark = DEAD ; }
+#define KILL_PRINCIPAL_COL(c)		{ Col [c].start = DEAD_PRINCIPAL ; }
+#define KILL_NON_PRINCIPAL_COL(c)	{ Col [c].start = DEAD_NON_PRINCIPAL ; }
+
+/* Routines are either PUBLIC (user-callable) or PRIVATE (not user-callable) */
+#define PUBLIC
+#define PRIVATE static
+
+/* ========================================================================== */
+/* === Prototypes of PRIVATE routines ======================================= */
+/* ========================================================================== */
+
+PRIVATE int init_rows_cols
+(
+    int n_row,
+    int n_col,
+    RowInfo Row [],
+    ColInfo Col [],
+    int A [],
+    int p []
+) ;
+
+PRIVATE void init_scoring
+(
+    int n_row,
+    int n_col,
+    RowInfo Row [],
+    ColInfo Col [],
+    int A [],
+    int head [],
+    double knobs [COLAMD_KNOBS],
+    int *p_n_row2,
+    int *p_n_col2,
+    int *p_max_deg
+) ;
+
+PRIVATE int find_ordering
+(
+    int n_row,
+    int n_col,
+    int Alen,
+    RowInfo Row [],
+    ColInfo Col [],
+    int A [],
+    int head [],
+    int n_col2,
+    int max_deg,
+    int pfree
+) ;
+
+PRIVATE void order_children
+(
+    int n_col,
+    ColInfo Col [],
+    int p []
+) ;
+
+PRIVATE void detect_super_cols
+(
+#ifndef NDEBUG
+    int n_col,
+    RowInfo Row [],
+#endif
+    ColInfo Col [],
+    int A [],
+    int head [],
+    int row_start,
+    int row_length
+) ;
+
+PRIVATE int garbage_collection
+(
+    int n_row,
+    int n_col,
+    RowInfo Row [],
+    ColInfo Col [],
+    int A [],
+    int *pfree
+) ;
+
+PRIVATE int clear_mark
+(
+    int n_row,
+    RowInfo Row []
+) ;
+
+/* ========================================================================== */
+/* === Debugging definitions ================================================ */
+/* ========================================================================== */
+
+#ifndef NDEBUG
+
+/* === With debugging ======================================================= */
+
+/* stdlib.h: for getenv and atoi, to get debugging level from environment */
+#include <stdlib.h>
+
+/* stdio.h:  for printf (no printing if debugging is turned off) */
+#include <stdio.h>
+
+PRIVATE void debug_deg_lists
+(
+    int n_row,
+    int n_col,
+    RowInfo Row [],
+    ColInfo Col [],
+    int head [],
+    int min_score,
+    int should,
+    int max_deg
+) ;
+
+PRIVATE void debug_mark
+(
+    int n_row,
+    RowInfo Row [],
+    int tag_mark,
+    int max_mark
+) ;
+
+PRIVATE void debug_matrix
+(
+    int n_row,
+    int n_col,
+    RowInfo Row [],
+    ColInfo Col [],
+    int A []
+) ;
+
+PRIVATE void debug_structures
+(
+    int n_row,
+    int n_col,
+    RowInfo Row [],
+    ColInfo Col [],
+    int A [],
+    int n_col2
+) ;
+
+/* the following is the *ONLY* global variable in this file, and is only */
+/* present when debugging */
+
+PRIVATE int debug_colamd ;	/* debug print level */
+
+#define DEBUG0(params) { (void) printf params ; }
+#define DEBUG1(params) { if (debug_colamd >= 1) (void) printf params ; }
+#define DEBUG2(params) { if (debug_colamd >= 2) (void) printf params ; }
+#define DEBUG3(params) { if (debug_colamd >= 3) (void) printf params ; }
+#define DEBUG4(params) { if (debug_colamd >= 4) (void) printf params ; }
+
+#else
+
+/* === No debugging ========================================================= */
+
+#define DEBUG0(params) ;
+#define DEBUG1(params) ;
+#define DEBUG2(params) ;
+#define DEBUG3(params) ;
+#define DEBUG4(params) ;
+
+#endif
+
+/* ========================================================================== */
+
+
+/* ========================================================================== */
+/* === USER-CALLABLE ROUTINES: ============================================== */
+/* ========================================================================== */
+
+
+/* ========================================================================== */
+/* === colamd_recommended =================================================== */
+/* ========================================================================== */
+
+/*
+    The colamd_recommended routine returns the suggested size for Alen.  This
+    value has been determined to provide good balance between the number of
+    garbage collections and the memory requirements for colamd.
+*/
+
+PUBLIC int colamd_recommended	/* returns recommended value of Alen. */
+(
+    /* === Parameters ======================================================= */
+
+    int nnz,			/* number of nonzeros in A */
+    int n_row,			/* number of rows in A */
+    int n_col			/* number of columns in A */
+)
+{
+    /* === Local variables ================================================== */
+
+    int minimum ;		/* bare minimum requirements */
+    int recommended ;		/* recommended value of Alen */
+
+    if (nnz < 0 || n_row < 0 || n_col < 0)
+    {
+	/* return -1 if any input argument is corrupted */
+	DEBUG0 (("colamd_recommended error!")) ;
+	DEBUG0 ((" nnz: %d, n_row: %d, n_col: %d\n", nnz, n_row, n_col)) ;
+	return (-1) ;
+    }
+
+    minimum =
+	2 * (nnz)		/* for A */
+	+ (((n_col) + 1) * sizeof (ColInfo) / sizeof (int))	/* for Col */
+	+ (((n_row) + 1) * sizeof (RowInfo) / sizeof (int))	/* for Row */
+	+ n_col			/* minimum elbow room to guarrantee success */
+	+ COLAMD_STATS ;	/* for output statistics */
+
+    /* recommended is equal to the minumum plus enough memory to keep the */
+    /* number garbage collections low */
+    recommended = minimum + nnz/5 ;
+
+    return (recommended) ;
+}
+
+
+/* ========================================================================== */
+/* === colamd_set_defaults ================================================== */
+/* ========================================================================== */
+
+/*
+    The colamd_set_defaults routine sets the default values of the user-
+    controllable parameters for colamd:
+
+	knobs [0]	rows with knobs[0]*n_col entries or more are removed
+			prior to ordering.
+
+	knobs [1]	columns with knobs[1]*n_row entries or more are removed
+			prior to ordering, and placed last in the column
+			permutation.
+
+	knobs [2..19]	unused, but future versions might use this
+*/
+
+PUBLIC void colamd_set_defaults
+(
+    /* === Parameters ======================================================= */
+
+    double knobs [COLAMD_KNOBS]		/* knob array */
+)
+{
+    /* === Local variables ================================================== */
+
+    int i ;
+
+    if (!knobs)
+    {
+	return ;			/* no knobs to initialize */
+    }
+    for (i = 0 ; i < COLAMD_KNOBS ; i++)
+    {
+	knobs [i] = 0 ;
+    }
+    knobs [COLAMD_DENSE_ROW] = 0.5 ;	/* ignore rows over 50% dense */
+    knobs [COLAMD_DENSE_COL] = 0.5 ;	/* ignore columns over 50% dense */
+}
+
+
+/* ========================================================================== */
+/* === colamd =============================================================== */
+/* ========================================================================== */
+
+/*
+    The colamd routine computes a column ordering Q of a sparse matrix
+    A such that the LU factorization P(AQ) = LU remains sparse, where P is
+    selected via partial pivoting.   The routine can also be viewed as
+    providing a permutation Q such that the Cholesky factorization
+    (AQ)'(AQ) = LL' remains sparse.
+
+    On input, the nonzero patterns of the columns of A are stored in the
+    array A, in order 0 to n_col-1.  A is held in 0-based form (rows in the
+    range 0 to n_row-1 and columns in the range 0 to n_col-1).  Row indices
+    for column c are located in A [(p [c]) ... (p [c+1]-1)], where p [0] = 0,
+    and thus p [n_col] is the number of entries in A.  The matrix is
+    destroyed on output.  The row indices within each column do not have to
+    be sorted (from small to large row indices), and duplicate row indices
+    may be present.  However, colamd will work a little faster if columns are
+    sorted and no duplicates are present.  Matlab 5.2 always passes the matrix
+    with sorted columns, and no duplicates.
+
+    The integer array A is of size Alen.  Alen must be at least of size
+    (where nnz is the number of entries in A):
+
+	nnz			for the input column form of A
+	+ nnz			for a row form of A that colamd generates
+	+ 6*(n_col+1)		for a ColInfo Col [0..n_col] array
+				(this assumes sizeof (ColInfo) is 6 int's).
+	+ 4*(n_row+1)		for a RowInfo Row [0..n_row] array
+				(this assumes sizeof (RowInfo) is 4 int's).
+	+ elbow_room		must be at least n_col.  We recommend at least
+				nnz/5 in addition to that.  If sufficient,
+				changes in the elbow room affect the ordering
+				time only, not the ordering itself.
+	+ COLAMD_STATS		for the output statistics
+
+    Colamd returns FALSE is memory is insufficient, or TRUE otherwise.
+
+    On input, the caller must specify:
+
+	n_row			the number of rows of A
+	n_col			the number of columns of A
+	Alen			the size of the array A
+	A [0 ... nnz-1]		the row indices, where nnz = p [n_col]
+	A [nnz ... Alen-1]	(need not be initialized by the user)
+	p [0 ... n_col]		the column pointers,  p [0] = 0, and p [n_col]
+				is the number of entries in A.  Column c of A
+				is stored in A [p [c] ... p [c+1]-1].
+	knobs [0 ... 19]	a set of parameters that control the behavior
+				of colamd.  If knobs is a NULL pointer the
+				defaults are used.  The user-callable
+				colamd_set_defaults routine sets the default
+				parameters.  See that routine for a description
+				of the user-controllable parameters.
+
+    If the return value of Colamd is TRUE, then on output:
+
+	p [0 ... n_col-1]	the column permutation. p [0] is the first
+				column index, and p [n_col-1] is the last.
+				That is, p [k] = j means that column j of A
+				is the kth column of AQ.
+
+	A			is undefined on output (the matrix pattern is
+				destroyed), except for the following statistics:
+
+	A [0]			the number of dense (or empty) rows ignored
+	A [1]			the number of dense (or empty) columms.  These
+				are ordered last, in their natural order.
+	A [2]			the number of garbage collections performed.
+				If this is excessive, then you would have
+				gotten your results faster if Alen was larger.
+	A [3]			0, if all row indices in each column were in
+				sorted order and no duplicates were present.
+				1, if there were unsorted or duplicate row
+				indices in the input.  You would have gotten
+				your results faster if A [3] was returned as 0.
+
+    If the return value of Colamd is FALSE, then A and p are undefined on
+    output.
+*/
+
+PUBLIC int colamd		/* returns TRUE if successful */
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,			/* number of rows in A */
+    int n_col,			/* number of columns in A */
+    int Alen,			/* length of A */
+    int A [],			/* row indices of A */
+    int p [],			/* pointers to columns in A */
+    double knobs [COLAMD_KNOBS]	/* parameters (uses defaults if NULL) */
+)
+{
+    /* === Local variables ================================================== */
+
+    int i ;			/* loop index */
+    int nnz ;			/* nonzeros in A */
+    int Row_size ;		/* size of Row [], in integers */
+    int Col_size ;		/* size of Col [], in integers */
+    int elbow_room ;		/* remaining free space */
+    RowInfo *Row ;		/* pointer into A of Row [0..n_row] array */
+    ColInfo *Col ;		/* pointer into A of Col [0..n_col] array */
+    int n_col2 ;		/* number of non-dense, non-empty columns */
+    int n_row2 ;		/* number of non-dense, non-empty rows */
+    int ngarbage ;		/* number of garbage collections performed */
+    int max_deg ;		/* maximum row degree */
+    double default_knobs [COLAMD_KNOBS] ;	/* default knobs knobs array */
+    int init_result ;		/* return code from initialization */
+
+#ifndef NDEBUG
+    debug_colamd = 0 ;		/* no debug printing */
+    /* get "D" environment variable, which gives the debug printing level */
+    if (getenv ("D")) debug_colamd = atoi (getenv ("D")) ;
+    DEBUG0 (("debug version, D = %d (THIS WILL BE SLOOOOW!)\n", debug_colamd)) ;
+#endif
+
+    /* === Check the input arguments ======================================== */
+
+    if (n_row < 0 || n_col < 0 || !A || !p)
+    {
+	/* n_row and n_col must be non-negative, A and p must be present */
+	DEBUG0 (("colamd error! %d %d %d\n", n_row, n_col, Alen)) ;
+	return (FALSE) ;
+    }
+    nnz = p [n_col] ;
+    if (nnz < 0 || p [0] != 0)
+    {
+	/* nnz must be non-negative, and p [0] must be zero */
+	DEBUG0 (("colamd error! %d %d\n", nnz, p [0])) ;
+	return (FALSE) ;
+    }
+
+    /* === If no knobs, set default parameters ============================== */
+
+    if (!knobs)
+    {
+	knobs = default_knobs ;
+	colamd_set_defaults (knobs) ;
+    }
+
+    /* === Allocate the Row and Col arrays from array A ===================== */
+
+    Col_size = (n_col + 1) * sizeof (ColInfo) / sizeof (int) ;
+    Row_size = (n_row + 1) * sizeof (RowInfo) / sizeof (int) ;
+    elbow_room = Alen - (2*nnz + Col_size + Row_size) ;
+    if (elbow_room < n_col + COLAMD_STATS)
+    {
+	/* not enough space in array A to perform the ordering */
+	DEBUG0 (("colamd error! elbow_room %d, %d\n", elbow_room,n_col)) ;
+	return (FALSE) ;
+    }
+    Alen = 2*nnz + elbow_room ;
+    Col  = (ColInfo *) &A [Alen] ;
+    Row  = (RowInfo *) &A [Alen + Col_size] ;
+
+    /* === Construct the row and column data structures ===================== */
+
+    init_result = init_rows_cols (n_row, n_col, Row, Col, A, p) ;
+    if (init_result == -1)
+    {
+	/* input matrix is invalid */
+	DEBUG0 (("colamd error! matrix invalid\n")) ;
+	return (FALSE) ;
+    }
+
+    /* === Initialize scores, kill dense rows/columns ======================= */
+
+    init_scoring (n_row, n_col, Row, Col, A, p, knobs,
+	&n_row2, &n_col2, &max_deg) ;
+
+    /* === Order the supercolumns =========================================== */
+
+    ngarbage = find_ordering (n_row, n_col, Alen, Row, Col, A, p,
+	n_col2, max_deg, 2*nnz) ;
+
+    /* === Order the non-principal columns ================================== */
+
+    order_children (n_col, Col, p) ;
+
+    /* === Return statistics in A =========================================== */
+
+    for (i = 0 ; i < COLAMD_STATS ; i++)
+    {
+	A [i] = 0 ;
+    }
+    A [COLAMD_DENSE_ROW] = n_row - n_row2 ;
+    A [COLAMD_DENSE_COL] = n_col - n_col2 ;
+    A [COLAMD_DEFRAG_COUNT] = ngarbage ;
+    A [COLAMD_JUMBLED_COLS] = init_result ;
+
+    return (TRUE) ;
+}
+
+
+/* ========================================================================== */
+/* === NON-USER-CALLABLE ROUTINES: ========================================== */
+/* ========================================================================== */
+
+/* There are no user-callable routines beyond this point in the file */
+
+
+/* ========================================================================== */
+/* === init_rows_cols ======================================================= */
+/* ========================================================================== */
+
+/*
+    Takes the column form of the matrix in A and creates the row form of the
+    matrix.  Also, row and column attributes are stored in the Col and Row
+    structs.  If the columns are un-sorted or contain duplicate row indices,
+    this routine will also sort and remove duplicate row indices from the
+    column form of the matrix.  Returns -1 on error, 1 if columns jumbled,
+    or 0 if columns not jumbled.  Not user-callable.
+*/
+
+PRIVATE int init_rows_cols	/* returns status code */
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,			/* number of rows of A */
+    int n_col,			/* number of columns of A */
+    RowInfo Row [],		/* of size n_row+1 */
+    ColInfo Col [],		/* of size n_col+1 */
+    int A [],			/* row indices of A, of size Alen */
+    int p []			/* pointers to columns in A, of size n_col+1 */
+)
+{
+    /* === Local variables ================================================== */
+
+    int col ;			/* a column index */
+    int row ;			/* a row index */
+    int *cp ;			/* a column pointer */
+    int *cp_end ;		/* a pointer to the end of a column */
+    int *rp ;			/* a row pointer */
+    int *rp_end ;		/* a pointer to the end of a row */
+    int last_start ;		/* start index of previous column in A */
+    int start ;			/* start index of column in A */
+    int last_row ;		/* previous row */
+    int jumbled_columns ;	/* indicates if columns are jumbled */
+
+    /* === Initialize columns, and check column pointers ==================== */
+
+    last_start = 0 ;
+    for (col = 0 ; col < n_col ; col++)
+    {
+	start = p [col] ;
+	if (start < last_start)
+	{
+	    /* column pointers must be non-decreasing */
+	    DEBUG0 (("colamd error!  last p %d p [col] %d\n",last_start,start));
+	    return (-1) ;
+	}
+	Col [col].start = start ;
+	Col [col].length = p [col+1] - start ;
+	Col [col].shared1.thickness = 1 ;
+	Col [col].shared2.score = 0 ;
+	Col [col].shared3.prev = EMPTY ;
+	Col [col].shared4.degree_next = EMPTY ;
+	last_start = start ;
+    }
+    /* must check the end pointer for last column */
+    if (p [n_col] < last_start)
+    {
+	/* column pointers must be non-decreasing */
+	DEBUG0 (("colamd error!  last p %d p [n_col] %d\n",p[col],last_start)) ;
+	return (-1) ;
+    }
+
+    /* p [0..n_col] no longer needed, used as "head" in subsequent routines */
+
+    /* === Scan columns, compute row degrees, and check row indices ========= */
+
+    jumbled_columns = FALSE ;
+
+    for (row = 0 ; row < n_row ; row++)
+    {
+	Row [row].length = 0 ;
+	Row [row].shared2.mark = -1 ;
+    }
+
+    for (col = 0 ; col < n_col ; col++)
+    {
+	last_row = -1 ;
+
+	cp = &A [p [col]] ;
+	cp_end = &A [p [col+1]] ;
+
+	while (cp < cp_end)
+	{
+	    row = *cp++ ;
+
+	    /* make sure row indices within range */
+	    if (row < 0 || row >= n_row)
+	    {
+		DEBUG0 (("colamd error!  col %d row %d last_row %d\n",
+			 col, row, last_row)) ;
+		return (-1) ;
+	    }
+	    else if (row <= last_row)
+	    {
+		/* row indices are not sorted or repeated, thus cols */
+		/* are jumbled */
+		jumbled_columns = TRUE ;
+	    }
+	    /* prevent repeated row from being counted */
+	    if (Row [row].shared2.mark != col)
+	    {
+		Row [row].length++ ;
+		Row [row].shared2.mark = col ;
+		last_row = row ;
+	    }
+	    else
+	    {
+		/* this is a repeated entry in the column, */
+		/* it will be removed */
+		Col [col].length-- ;
+	    }
+	}
+    }
+
+    /* === Compute row pointers ============================================= */
+
+    /* row form of the matrix starts directly after the column */
+    /* form of matrix in A */
+    Row [0].start = p [n_col] ;
+    Row [0].shared1.p = Row [0].start ;
+    Row [0].shared2.mark = -1 ;
+    for (row = 1 ; row < n_row ; row++)
+    {
+	Row [row].start = Row [row-1].start + Row [row-1].length ;
+	Row [row].shared1.p = Row [row].start ;
+	Row [row].shared2.mark = -1 ;
+    }
+
+    /* === Create row form ================================================== */
+
+    if (jumbled_columns)
+    {
+	/* if cols jumbled, watch for repeated row indices */
+	for (col = 0 ; col < n_col ; col++)
+	{
+	    cp = &A [p [col]] ;
+	    cp_end = &A [p [col+1]] ;
+	    while (cp < cp_end)
+	    {
+		row = *cp++ ;
+		if (Row [row].shared2.mark != col)
+		{
+		    A [(Row [row].shared1.p)++] = col ;
+		    Row [row].shared2.mark = col ;
+		}
+	    }
+	}
+    }
+    else
+    {
+	/* if cols not jumbled, we don't need the mark (this is faster) */
+	for (col = 0 ; col < n_col ; col++)
+	{
+	    cp = &A [p [col]] ;
+	    cp_end = &A [p [col+1]] ;
+	    while (cp < cp_end)
+	    {
+		A [(Row [*cp++].shared1.p)++] = col ;
+	    }
+	}
+    }
+
+    /* === Clear the row marks and set row degrees ========================== */
+
+    for (row = 0 ; row < n_row ; row++)
+    {
+	Row [row].shared2.mark = 0 ;
+	Row [row].shared1.degree = Row [row].length ;
+    }
+
+    /* === See if we need to re-create columns ============================== */
+
+    if (jumbled_columns)
+    {
+
+#ifndef NDEBUG
+	/* make sure column lengths are correct */
+	for (col = 0 ; col < n_col ; col++)
+	{
+	    p [col] = Col [col].length ;
+	}
+	for (row = 0 ; row < n_row ; row++)
+	{
+	    rp = &A [Row [row].start] ;
+	    rp_end = rp + Row [row].length ;
+	    while (rp < rp_end)
+	    {
+		p [*rp++]-- ;
+	    }
+	}
+	for (col = 0 ; col < n_col ; col++)
+	{
+	    assert (p [col] == 0) ;
+	}
+	/* now p is all zero (different than when debugging is turned off) */
+#endif
+
+	/* === Compute col pointers ========================================= */
+
+	/* col form of the matrix starts at A [0]. */
+	/* Note, we may have a gap between the col form and the row */
+	/* form if there were duplicate entries, if so, it will be */
+	/* removed upon the first garbage collection */
+	Col [0].start = 0 ;
+	p [0] = Col [0].start ;
+	for (col = 1 ; col < n_col ; col++)
+	{
+	    /* note that the lengths here are for pruned columns, i.e. */
+	    /* no duplicate row indices will exist for these columns */
+	    Col [col].start = Col [col-1].start + Col [col-1].length ;
+	    p [col] = Col [col].start ;
+	}
+
+	/* === Re-create col form =========================================== */
+
+	for (row = 0 ; row < n_row ; row++)
+	{
+	    rp = &A [Row [row].start] ;
+	    rp_end = rp + Row [row].length ;
+	    while (rp < rp_end)
+	    {
+		A [(p [*rp++])++] = row ;
+	    }
+	}
+	return (1) ;
+    }
+    else
+    {
+	/* no columns jumbled (this is faster) */
+	return (0) ;
+    }
+}
+
+
+/* ========================================================================== */
+/* === init_scoring ========================================================= */
+/* ========================================================================== */
+
+/*
+    Kills dense or empty columns and rows, calculates an initial score for
+    each column, and places all columns in the degree lists.  Not user-callable.
+*/
+
+PRIVATE void init_scoring
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,			/* number of rows of A */
+    int n_col,			/* number of columns of A */
+    RowInfo Row [],		/* of size n_row+1 */
+    ColInfo Col [],		/* of size n_col+1 */
+    int A [],			/* column form and row form of A */
+    int head [],		/* of size n_col+1 */
+    double knobs [COLAMD_KNOBS],/* parameters */
+    int *p_n_row2,		/* number of non-dense, non-empty rows */
+    int *p_n_col2,		/* number of non-dense, non-empty columns */
+    int *p_max_deg		/* maximum row degree */
+)
+{
+    /* === Local variables ================================================== */
+
+    int c ;			/* a column index */
+    int r, row ;		/* a row index */
+    int *cp ;			/* a column pointer */
+    int deg ;			/* degree (# entries) of a row or column */
+    int *cp_end ;		/* a pointer to the end of a column */
+    int *new_cp ;		/* new column pointer */
+    int col_length ;		/* length of pruned column */
+    int score ;			/* current column score */
+    int n_col2 ;		/* number of non-dense, non-empty columns */
+    int n_row2 ;		/* number of non-dense, non-empty rows */
+    int dense_row_count ;	/* remove rows with more entries than this */
+    int dense_col_count ;	/* remove cols with more entries than this */
+    int min_score ;		/* smallest column score */
+    int max_deg ;		/* maximum row degree */
+    int next_col ;		/* Used to add to degree list.*/
+#ifndef NDEBUG
+    int debug_count ;		/* debug only. */
+#endif
+
+    /* === Extract knobs ==================================================== */
+
+    dense_row_count = MAX (0, MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ;
+    dense_col_count = MAX (0, MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ;
+    DEBUG0 (("densecount: %d %d\n", dense_row_count, dense_col_count)) ;
+    max_deg = 0 ;
+    n_col2 = n_col ;
+    n_row2 = n_row ;
+
+    /* === Kill empty columns =============================================== */
+
+    /* Put the empty columns at the end in their natural, so that LU */
+    /* factorization can proceed as far as possible. */
+    for (c = n_col-1 ; c >= 0 ; c--)
+    {
+	deg = Col [c].length ;
+	if (deg == 0)
+	{
+	    /* this is a empty column, kill and order it last */
+	    Col [c].shared2.order = --n_col2 ;
+	    KILL_PRINCIPAL_COL (c) ;
+	}
+    }
+    DEBUG0 (("null columns killed: %d\n", n_col - n_col2)) ;
+
+    /* === Kill dense columns =============================================== */
+
+    /* Put the dense columns at the end, in their natural order */
+    for (c = n_col-1 ; c >= 0 ; c--)
+    {
+	/* skip any dead columns */
+	if (COL_IS_DEAD (c))
+	{
+	    continue ;
+	}
+	deg = Col [c].length ;
+	if (deg > dense_col_count)
+	{
+	    /* this is a dense column, kill and order it last */
+	    Col [c].shared2.order = --n_col2 ;
+	    /* decrement the row degrees */
+	    cp = &A [Col [c].start] ;
+	    cp_end = cp + Col [c].length ;
+	    while (cp < cp_end)
+	    {
+		Row [*cp++].shared1.degree-- ;
+	    }
+	    KILL_PRINCIPAL_COL (c) ;
+	}
+    }
+    DEBUG0 (("Dense and null columns killed: %d\n", n_col - n_col2)) ;
+
+    /* === Kill dense and empty rows ======================================== */
+
+    for (r = 0 ; r < n_row ; r++)
+    {
+	deg = Row [r].shared1.degree ;
+	assert (deg >= 0 && deg <= n_col) ;
+	if (deg > dense_row_count || deg == 0)
+	{
+	    /* kill a dense or empty row */
+	    KILL_ROW (r) ;
+	    --n_row2 ;
+	}
+	else
+	{
+	    /* keep track of max degree of remaining rows */
+	    max_deg = MAX (max_deg, deg) ;
+	}
+    }
+    DEBUG0 (("Dense and null rows killed: %d\n", n_row - n_row2)) ;
+
+    /* === Compute initial column scores ==================================== */
+
+    /* At this point the row degrees are accurate.  They reflect the number */
+    /* of "live" (non-dense) columns in each row.  No empty rows exist. */
+    /* Some "live" columns may contain only dead rows, however.  These are */
+    /* pruned in the code below. */
+
+    /* now find the initial matlab score for each column */
+    for (c = n_col-1 ; c >= 0 ; c--)
+    {
+	/* skip dead column */
+	if (COL_IS_DEAD (c))
+	{
+	    continue ;
+	}
+	score = 0 ;
+	cp = &A [Col [c].start] ;
+	new_cp = cp ;
+	cp_end = cp + Col [c].length ;
+	while (cp < cp_end)
+	{
+	    /* get a row */
+	    row = *cp++ ;
+	    /* skip if dead */
+	    if (ROW_IS_DEAD (row))
+	    {
+		continue ;
+	    }
+	    /* compact the column */
+	    *new_cp++ = row ;
+	    /* add row's external degree */
+	    score += Row [row].shared1.degree - 1 ;
+	    /* guard against integer overflow */
+	    score = MIN (score, n_col) ;
+	}
+	/* determine pruned column length */
+	col_length = (int) (new_cp - &A [Col [c].start]) ;
+	if (col_length == 0)
+	{
+	    /* a newly-made null column (all rows in this col are "dense" */
+	    /* and have already been killed) */
+	    DEBUG0 (("Newly null killed: %d\n", c)) ;
+	    Col [c].shared2.order = --n_col2 ;
+	    KILL_PRINCIPAL_COL (c) ;
+	}
+	else
+	{
+	    /* set column length and set score */
+	    assert (score >= 0) ;
+	    assert (score <= n_col) ;
+	    Col [c].length = col_length ;
+	    Col [c].shared2.score = score ;
+	}
+    }
+    DEBUG0 (("Dense, null, and newly-null columns killed: %d\n",n_col-n_col2)) ;
+
+    /* At this point, all empty rows and columns are dead.  All live columns */
+    /* are "clean" (containing no dead rows) and simplicial (no supercolumns */
+    /* yet).  Rows may contain dead columns, but all live rows contain at */
+    /* least one live column. */
+
+#ifndef NDEBUG
+    debug_structures (n_row, n_col, Row, Col, A, n_col2) ;
+#endif
+
+    /* === Initialize degree lists ========================================== */
+
+#ifndef NDEBUG
+    debug_count = 0 ;
+#endif
+
+    /* clear the hash buckets */
+    for (c = 0 ; c <= n_col ; c++)
+    {
+	head [c] = EMPTY ;
+    }
+    min_score = n_col ;
+    /* place in reverse order, so low column indices are at the front */
+    /* of the lists.  This is to encourage natural tie-breaking */
+    for (c = n_col-1 ; c >= 0 ; c--)
+    {
+	/* only add principal columns to degree lists */
+	if (COL_IS_ALIVE (c))
+	{
+	    DEBUG4 (("place %d score %d minscore %d ncol %d\n",
+		c, Col [c].shared2.score, min_score, n_col)) ;
+
+	    /* === Add columns score to DList =============================== */
+
+	    score = Col [c].shared2.score ;
+
+	    assert (min_score >= 0) ;
+	    assert (min_score <= n_col) ;
+	    assert (score >= 0) ;
+	    assert (score <= n_col) ;
+	    assert (head [score] >= EMPTY) ;
+
+	    /* now add this column to dList at proper score location */
+	    next_col = head [score] ;
+	    Col [c].shared3.prev = EMPTY ;
+	    Col [c].shared4.degree_next = next_col ;
+
+	    /* if there already was a column with the same score, set its */
+	    /* previous pointer to this new column */
+	    if (next_col != EMPTY)
+	    {
+		Col [next_col].shared3.prev = c ;
+	    }
+	    head [score] = c ;
+
+	    /* see if this score is less than current min */
+	    min_score = MIN (min_score, score) ;
+
+#ifndef NDEBUG
+	    debug_count++ ;
+#endif
+	}
+    }
+
+#ifndef NDEBUG
+    DEBUG0 (("Live cols %d out of %d, non-princ: %d\n",
+	debug_count, n_col, n_col-debug_count)) ;
+    assert (debug_count == n_col2) ;
+    debug_deg_lists (n_row, n_col, Row, Col, head, min_score, n_col2, max_deg) ;
+#endif
+
+    /* === Return number of remaining columns, and max row degree =========== */
+
+    *p_n_col2 = n_col2 ;
+    *p_n_row2 = n_row2 ;
+    *p_max_deg = max_deg ;
+}
+
+
+/* ========================================================================== */
+/* === find_ordering ======================================================== */
+/* ========================================================================== */
+
+/*
+    Order the principal columns of the supercolumn form of the matrix
+    (no supercolumns on input).  Uses a minimum approximate column minimum
+    degree ordering method.  Not user-callable.
+*/
+
+PRIVATE int find_ordering	/* return the number of garbage collections */
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,			/* number of rows of A */
+    int n_col,			/* number of columns of A */
+    int Alen,			/* size of A, 2*nnz + elbow_room or larger */
+    RowInfo Row [],		/* of size n_row+1 */
+    ColInfo Col [],		/* of size n_col+1 */
+    int A [],			/* column form and row form of A */
+    int head [],		/* of size n_col+1 */
+    int n_col2,			/* Remaining columns to order */
+    int max_deg,		/* Maximum row degree */
+    int pfree			/* index of first free slot (2*nnz on entry) */
+)
+{
+    /* === Local variables ================================================== */
+
+    int k ;			/* current pivot ordering step */
+    int pivot_col ;		/* current pivot column */
+    int *cp ;			/* a column pointer */
+    int *rp ;			/* a row pointer */
+    int pivot_row ;		/* current pivot row */
+    int *new_cp ;		/* modified column pointer */
+    int *new_rp ;		/* modified row pointer */
+    int pivot_row_start ;	/* pointer to start of pivot row */
+    int pivot_row_degree ;	/* # of columns in pivot row */
+    int pivot_row_length ;	/* # of supercolumns in pivot row */
+    int pivot_col_score ;	/* score of pivot column */
+    int needed_memory ;		/* free space needed for pivot row */
+    int *cp_end ;		/* pointer to the end of a column */
+    int *rp_end ;		/* pointer to the end of a row */
+    int row ;			/* a row index */
+    int col ;			/* a column index */
+    int max_score ;		/* maximum possible score */
+    int cur_score ;		/* score of current column */
+    unsigned int hash ;		/* hash value for supernode detection */
+    int head_column ;		/* head of hash bucket */
+    int first_col ;		/* first column in hash bucket */
+    int tag_mark ;		/* marker value for mark array */
+    int row_mark ;		/* Row [row].shared2.mark */
+    int set_difference ;	/* set difference size of row with pivot row */
+    int min_score ;		/* smallest column score */
+    int col_thickness ;		/* "thickness" (# of columns in a supercol) */
+    int max_mark ;		/* maximum value of tag_mark */
+    int pivot_col_thickness ;	/* number of columns represented by pivot col */
+    int prev_col ;		/* Used by Dlist operations. */
+    int next_col ;		/* Used by Dlist operations. */
+    int ngarbage ;		/* number of garbage collections performed */
+#ifndef NDEBUG
+    int debug_d ;		/* debug loop counter */
+    int debug_step = 0 ;	/* debug loop counter */
+#endif
+
+    /* === Initialization and clear mark ==================================== */
+
+    max_mark = INT_MAX - n_col ;	/* INT_MAX defined in <limits.h> */
+    tag_mark = clear_mark (n_row, Row) ;
+    min_score = 0 ;
+    ngarbage = 0 ;
+    DEBUG0 (("Ordering.. n_col2=%d\n", n_col2)) ;
+
+    /* === Order the columns ================================================ */
+
+    for (k = 0 ; k < n_col2 ; /* 'k' is incremented below */)
+    {
+
+#ifndef NDEBUG
+	if (debug_step % 100 == 0)
+	{
+	    DEBUG0 (("\n...       Step k: %d out of n_col2: %d\n", k, n_col2)) ;
+	}
+	else
+	{
+	    DEBUG1 (("\n----------Step k: %d out of n_col2: %d\n", k, n_col2)) ;
+	}
+	debug_step++ ;
+	debug_deg_lists (n_row, n_col, Row, Col, head,
+		min_score, n_col2-k, max_deg) ;
+	debug_matrix (n_row, n_col, Row, Col, A) ;
+#endif
+
+	/* === Select pivot column, and order it ============================ */
+
+	/* make sure degree list isn't empty */
+	assert (min_score >= 0) ;
+	assert (min_score <= n_col) ;
+	assert (head [min_score] >= EMPTY) ;
+
+#ifndef NDEBUG
+	for (debug_d = 0 ; debug_d < min_score ; debug_d++)
+	{
+	    assert (head [debug_d] == EMPTY) ;
+	}
+#endif
+
+	/* get pivot column from head of minimum degree list */
+	while (head [min_score] == EMPTY && min_score < n_col)
+	{
+	    min_score++ ;
+	}
+	pivot_col = head [min_score] ;
+	assert (pivot_col >= 0 && pivot_col <= n_col) ;
+	next_col = Col [pivot_col].shared4.degree_next ;
+	head [min_score] = next_col ;
+	if (next_col != EMPTY)
+	{
+	    Col [next_col].shared3.prev = EMPTY ;
+	}
+
+	assert (COL_IS_ALIVE (pivot_col)) ;
+	DEBUG3 (("Pivot col: %d\n", pivot_col)) ;
+
+	/* remember score for defrag check */
+	pivot_col_score = Col [pivot_col].shared2.score ;
+
+	/* the pivot column is the kth column in the pivot order */
+	Col [pivot_col].shared2.order = k ;
+
+	/* increment order count by column thickness */
+	pivot_col_thickness = Col [pivot_col].shared1.thickness ;
+	k += pivot_col_thickness ;
+	assert (pivot_col_thickness > 0) ;
+
+	/* === Garbage_collection, if necessary ============================= */
+
+	needed_memory = MIN (pivot_col_score, n_col - k) ;
+	if (pfree + needed_memory >= Alen)
+	{
+	    pfree = garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ;
+	    ngarbage++ ;
+	    /* after garbage collection we will have enough */
+	    assert (pfree + needed_memory < Alen) ;
+	    /* garbage collection has wiped out the Row[].shared2.mark array */
+	    tag_mark = clear_mark (n_row, Row) ;
+#ifndef NDEBUG
+	    debug_matrix (n_row, n_col, Row, Col, A) ;
+#endif
+	}
+
+	/* === Compute pivot row pattern ==================================== */
+
+	/* get starting location for this new merged row */
+	pivot_row_start = pfree ;
+
+	/* initialize new row counts to zero */
+	pivot_row_degree = 0 ;
+
+	/* tag pivot column as having been visited so it isn't included */
+	/* in merged pivot row */
+	Col [pivot_col].shared1.thickness = -pivot_col_thickness ;
+
+	/* pivot row is the union of all rows in the pivot column pattern */
+	cp = &A [Col [pivot_col].start] ;
+	cp_end = cp + Col [pivot_col].length ;
+	while (cp < cp_end)
+	{
+	    /* get a row */
+	    row = *cp++ ;
+	    DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ;
+	    /* skip if row is dead */
+	    if (ROW_IS_DEAD (row))
+	    {
+		continue ;
+	    }
+	    rp = &A [Row [row].start] ;
+	    rp_end = rp + Row [row].length ;
+	    while (rp < rp_end)
+	    {
+		/* get a column */
+		col = *rp++ ;
+		/* add the column, if alive and untagged */
+		col_thickness = Col [col].shared1.thickness ;
+		if (col_thickness > 0 && COL_IS_ALIVE (col))
+		{
+		    /* tag column in pivot row */
+		    Col [col].shared1.thickness = -col_thickness ;
+		    assert (pfree < Alen) ;
+		    /* place column in pivot row */
+		    A [pfree++] = col ;
+		    pivot_row_degree += col_thickness ;
+		}
+	    }
+	}
+
+	/* clear tag on pivot column */
+	Col [pivot_col].shared1.thickness = pivot_col_thickness ;
+	max_deg = MAX (max_deg, pivot_row_degree) ;
+
+#ifndef NDEBUG
+	DEBUG3 (("check2\n")) ;
+	debug_mark (n_row, Row, tag_mark, max_mark) ;
+#endif
+
+	/* === Kill all rows used to construct pivot row ==================== */
+
+	/* also kill pivot row, temporarily */
+	cp = &A [Col [pivot_col].start] ;
+	cp_end = cp + Col [pivot_col].length ;
+	while (cp < cp_end)
+	{
+	    /* may be killing an already dead row */
+	    row = *cp++ ;
+	    DEBUG2 (("Kill row in pivot col: %d\n", row)) ;
+	    KILL_ROW (row) ;
+	}
+
+	/* === Select a row index to use as the new pivot row =============== */
+
+	pivot_row_length = pfree - pivot_row_start ;
+	if (pivot_row_length > 0)
+	{
+	    /* pick the "pivot" row arbitrarily (first row in col) */
+	    pivot_row = A [Col [pivot_col].start] ;
+	    DEBUG2 (("Pivotal row is %d\n", pivot_row)) ;
+	}
+	else
+	{
+	    /* there is no pivot row, since it is of zero length */
+	    pivot_row = EMPTY ;
+	    assert (pivot_row_length == 0) ;
+	}
+	assert (Col [pivot_col].length > 0 || pivot_row_length == 0) ;
+
+	/* === Approximate degree computation =============================== */
+
+	/* Here begins the computation of the approximate degree.  The column */
+	/* score is the sum of the pivot row "length", plus the size of the */
+	/* set differences of each row in the column minus the pattern of the */
+	/* pivot row itself.  The column ("thickness") itself is also */
+	/* excluded from the column score (we thus use an approximate */
+	/* external degree). */
+
+	/* The time taken by the following code (compute set differences, and */
+	/* add them up) is proportional to the size of the data structure */
+	/* being scanned - that is, the sum of the sizes of each column in */
+	/* the pivot row.  Thus, the amortized time to compute a column score */
+	/* is proportional to the size of that column (where size, in this */
+	/* context, is the column "length", or the number of row indices */
+	/* in that column).  The number of row indices in a column is */
+	/* monotonically non-decreasing, from the length of the original */
+	/* column on input to colamd. */
+
+	/* === Compute set differences ====================================== */
+
+	DEBUG1 (("** Computing set differences phase. **\n")) ;
+
+	/* pivot row is currently dead - it will be revived later. */
+
+	DEBUG2 (("Pivot row: ")) ;
+	/* for each column in pivot row */
+	rp = &A [pivot_row_start] ;
+	rp_end = rp + pivot_row_length ;
+	while (rp < rp_end)
+	{
+	    col = *rp++ ;
+	    assert (COL_IS_ALIVE (col) && col != pivot_col) ;
+	    DEBUG2 (("Col: %d\n", col)) ;
+
+	    /* clear tags used to construct pivot row pattern */
+	    col_thickness = -Col [col].shared1.thickness ;
+	    assert (col_thickness > 0) ;
+	    Col [col].shared1.thickness = col_thickness ;
+
+	    /* === Remove column from degree list =========================== */
+
+	    cur_score = Col [col].shared2.score ;
+	    prev_col = Col [col].shared3.prev ;
+	    next_col = Col [col].shared4.degree_next ;
+	    assert (cur_score >= 0) ;
+	    assert (cur_score <= n_col) ;
+	    assert (cur_score >= EMPTY) ;
+	    if (prev_col == EMPTY)
+	    {
+		head [cur_score] = next_col ;
+	    }
+	    else
+	    {
+		Col [prev_col].shared4.degree_next = next_col ;
+	    }
+	    if (next_col != EMPTY)
+	    {
+		Col [next_col].shared3.prev = prev_col ;
+	    }
+
+	    /* === Scan the column ========================================== */
+
+	    cp = &A [Col [col].start] ;
+	    cp_end = cp + Col [col].length ;
+	    while (cp < cp_end)
+	    {
+		/* get a row */
+		row = *cp++ ;
+		row_mark = Row [row].shared2.mark ;
+		/* skip if dead */
+		if (ROW_IS_MARKED_DEAD (row_mark))
+		{
+		    continue ;
+		}
+		assert (row != pivot_row) ;
+		set_difference = row_mark - tag_mark ;
+		/* check if the row has been seen yet */
+		if (set_difference < 0)
+		{
+		    assert (Row [row].shared1.degree <= max_deg) ;
+		    set_difference = Row [row].shared1.degree ;
+		}
+		/* subtract column thickness from this row's set difference */
+		set_difference -= col_thickness ;
+		assert (set_difference >= 0) ;
+		/* absorb this row if the set difference becomes zero */
+		if (set_difference == 0)
+		{
+		    DEBUG1 (("aggressive absorption. Row: %d\n", row)) ;
+		    KILL_ROW (row) ;
+		}
+		else
+		{
+		    /* save the new mark */
+		    Row [row].shared2.mark = set_difference + tag_mark ;
+		}
+	    }
+	}
+
+#ifndef NDEBUG
+	debug_deg_lists (n_row, n_col, Row, Col, head,
+		min_score, n_col2-k-pivot_row_degree, max_deg) ;
+#endif
+
+	/* === Add up set differences for each column ======================= */
+
+	DEBUG1 (("** Adding set differences phase. **\n")) ;
+
+	/* for each column in pivot row */
+	rp = &A [pivot_row_start] ;
+	rp_end = rp + pivot_row_length ;
+	while (rp < rp_end)
+	{
+	    /* get a column */
+	    col = *rp++ ;
+	    assert (COL_IS_ALIVE (col) && col != pivot_col) ;
+	    hash = 0 ;
+	    cur_score = 0 ;
+	    cp = &A [Col [col].start] ;
+	    /* compact the column */
+	    new_cp = cp ;
+	    cp_end = cp + Col [col].length ;
+
+	    DEBUG2 (("Adding set diffs for Col: %d.\n", col)) ;
+
+	    while (cp < cp_end)
+	    {
+		/* get a row */
+		row = *cp++ ;
+		assert(row >= 0 && row < n_row) ;
+		row_mark = Row [row].shared2.mark ;
+		/* skip if dead */
+		if (ROW_IS_MARKED_DEAD (row_mark))
+		{
+		    continue ;
+		}
+		assert (row_mark > tag_mark) ;
+		/* compact the column */
+		*new_cp++ = row ;
+		/* compute hash function */
+		hash += row ;
+		/* add set difference */
+		cur_score += row_mark - tag_mark ;
+		/* integer overflow... */
+		cur_score = MIN (cur_score, n_col) ;
+	    }
+
+	    /* recompute the column's length */
+	    Col [col].length = (int) (new_cp - &A [Col [col].start]) ;
+
+	    /* === Further mass elimination ================================= */
+
+	    if (Col [col].length == 0)
+	    {
+		DEBUG1 (("further mass elimination. Col: %d\n", col)) ;
+		/* nothing left but the pivot row in this column */
+		KILL_PRINCIPAL_COL (col) ;
+		pivot_row_degree -= Col [col].shared1.thickness ;
+		assert (pivot_row_degree >= 0) ;
+		/* order it */
+		Col [col].shared2.order = k ;
+		/* increment order count by column thickness */
+		k += Col [col].shared1.thickness ;
+	    }
+	    else
+	    {
+		/* === Prepare for supercolumn detection ==================== */
+
+		DEBUG2 (("Preparing supercol detection for Col: %d.\n", col)) ;
+
+		/* save score so far */
+		Col [col].shared2.score = cur_score ;
+
+		/* add column to hash table, for supercolumn detection */
+		hash %= n_col + 1 ;
+
+		DEBUG2 ((" Hash = %d, n_col = %d.\n", hash, n_col)) ;
+		assert (hash <= n_col) ;
+
+		head_column = head [hash] ;
+		if (head_column > EMPTY)
+		{
+		    /* degree list "hash" is non-empty, use prev (shared3) of */
+		    /* first column in degree list as head of hash bucket */
+		    first_col = Col [head_column].shared3.headhash ;
+		    Col [head_column].shared3.headhash = col ;
+		}
+		else
+		{
+		    /* degree list "hash" is empty, use head as hash bucket */
+		    first_col = - (head_column + 2) ;
+		    head [hash] = - (col + 2) ;
+		}
+		Col [col].shared4.hash_next = first_col ;
+
+		/* save hash function in Col [col].shared3.hash */
+		Col [col].shared3.hash = (int) hash ;
+		assert (COL_IS_ALIVE (col)) ;
+	    }
+	}
+
+	/* The approximate external column degree is now computed.  */
+
+	/* === Supercolumn detection ======================================== */
+
+	DEBUG1 (("** Supercolumn detection phase. **\n")) ;
+
+	detect_super_cols (
+#ifndef NDEBUG
+		n_col, Row,
+#endif
+		Col, A, head, pivot_row_start, pivot_row_length) ;
+
+	/* === Kill the pivotal column ====================================== */
+
+	KILL_PRINCIPAL_COL (pivot_col) ;
+
+	/* === Clear mark =================================================== */
+
+	tag_mark += (max_deg + 1) ;
+	if (tag_mark >= max_mark)
+	{
+	    DEBUG1 (("clearing tag_mark\n")) ;
+	    tag_mark = clear_mark (n_row, Row) ;
+	}
+#ifndef NDEBUG
+	DEBUG3 (("check3\n")) ;
+	debug_mark (n_row, Row, tag_mark, max_mark) ;
+#endif
+
+	/* === Finalize the new pivot row, and column scores ================ */
+
+	DEBUG1 (("** Finalize scores phase. **\n")) ;
+
+	/* for each column in pivot row */
+	rp = &A [pivot_row_start] ;
+	/* compact the pivot row */
+	new_rp = rp ;
+	rp_end = rp + pivot_row_length ;
+	while (rp < rp_end)
+	{
+	    col = *rp++ ;
+	    /* skip dead columns */
+	    if (COL_IS_DEAD (col))
+	    {
+		continue ;
+	    }
+	    *new_rp++ = col ;
+	    /* add new pivot row to column */
+	    A [Col [col].start + (Col [col].length++)] = pivot_row ;
+
+	    /* retrieve score so far and add on pivot row's degree. */
+	    /* (we wait until here for this in case the pivot */
+	    /* row's degree was reduced due to mass elimination). */
+	    cur_score = Col [col].shared2.score + pivot_row_degree ;
+
+	    /* calculate the max possible score as the number of */
+	    /* external columns minus the 'k' value minus the */
+	    /* columns thickness */
+	    max_score = n_col - k - Col [col].shared1.thickness ;
+
+	    /* make the score the external degree of the union-of-rows */
+	    cur_score -= Col [col].shared1.thickness ;
+
+	    /* make sure score is less or equal than the max score */
+	    cur_score = MIN (cur_score, max_score) ;
+	    assert (cur_score >= 0) ;
+
+	    /* store updated score */
+	    Col [col].shared2.score = cur_score ;
+
+	    /* === Place column back in degree list ========================= */
+
+	    assert (min_score >= 0) ;
+	    assert (min_score <= n_col) ;
+	    assert (cur_score >= 0) ;
+	    assert (cur_score <= n_col) ;
+	    assert (head [cur_score] >= EMPTY) ;
+	    next_col = head [cur_score] ;
+	    Col [col].shared4.degree_next = next_col ;
+	    Col [col].shared3.prev = EMPTY ;
+	    if (next_col != EMPTY)
+	    {
+		Col [next_col].shared3.prev = col ;
+	    }
+	    head [cur_score] = col ;
+
+	    /* see if this score is less than current min */
+	    min_score = MIN (min_score, cur_score) ;
+
+	}
+
+#ifndef NDEBUG
+	debug_deg_lists (n_row, n_col, Row, Col, head,
+		min_score, n_col2-k, max_deg) ;
+#endif
+
+	/* === Resurrect the new pivot row ================================== */
+
+	if (pivot_row_degree > 0)
+	{
+	    /* update pivot row length to reflect any cols that were killed */
+	    /* during super-col detection and mass elimination */
+	    Row [pivot_row].start  = pivot_row_start ;
+	    Row [pivot_row].length = (int) (new_rp - &A[pivot_row_start]) ;
+	    Row [pivot_row].shared1.degree = pivot_row_degree ;
+	    Row [pivot_row].shared2.mark = 0 ;
+	    /* pivot row is no longer dead */
+	}
+    }
+
+    /* === All principal columns have now been ordered ====================== */
+
+    return (ngarbage) ;
+}
+
+
+/* ========================================================================== */
+/* === order_children ======================================================= */
+/* ========================================================================== */
+
+/*
+    The find_ordering routine has ordered all of the principal columns (the
+    representatives of the supercolumns).  The non-principal columns have not
+    yet been ordered.  This routine orders those columns by walking up the
+    parent tree (a column is a child of the column which absorbed it).  The
+    final permutation vector is then placed in p [0 ... n_col-1], with p [0]
+    being the first column, and p [n_col-1] being the last.  It doesn't look
+    like it at first glance, but be assured that this routine takes time linear
+    in the number of columns.  Although not immediately obvious, the time
+    taken by this routine is O (n_col), that is, linear in the number of
+    columns.  Not user-callable.
+*/
+
+PRIVATE void order_children
+(
+    /* === Parameters ======================================================= */
+
+    int n_col,			/* number of columns of A */
+    ColInfo Col [],		/* of size n_col+1 */
+    int p []			/* p [0 ... n_col-1] is the column permutation*/
+)
+{
+    /* === Local variables ================================================== */
+
+    int i ;			/* loop counter for all columns */
+    int c ;			/* column index */
+    int parent ;		/* index of column's parent */
+    int order ;			/* column's order */
+
+    /* === Order each non-principal column ================================== */
+
+    for (i = 0 ; i < n_col ; i++)
+    {
+	/* find an un-ordered non-principal column */
+	assert (COL_IS_DEAD (i)) ;
+	if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == EMPTY)
+	{
+	    parent = i ;
+	    /* once found, find its principal parent */
+	    do
+	    {
+		parent = Col [parent].shared1.parent ;
+	    } while (!COL_IS_DEAD_PRINCIPAL (parent)) ;
+
+	    /* now, order all un-ordered non-principal columns along path */
+	    /* to this parent.  collapse tree at the same time */
+	    c = i ;
+	    /* get order of parent */
+	    order = Col [parent].shared2.order ;
+
+	    do
+	    {
+		assert (Col [c].shared2.order == EMPTY) ;
+
+		/* order this column */
+		Col [c].shared2.order = order++ ;
+		/* collaps tree */
+		Col [c].shared1.parent = parent ;
+
+		/* get immediate parent of this column */
+		c = Col [c].shared1.parent ;
+
+		/* continue until we hit an ordered column.  There are */
+		/* guarranteed not to be anymore unordered columns */
+		/* above an ordered column */
+	    } while (Col [c].shared2.order == EMPTY) ;
+
+	    /* re-order the super_col parent to largest order for this group */
+	    Col [parent].shared2.order = order ;
+	}
+    }
+
+    /* === Generate the permutation ========================================= */
+
+    for (c = 0 ; c < n_col ; c++)
+    {
+	p [Col [c].shared2.order] = c ;
+    }
+}
+
+
+/* ========================================================================== */
+/* === detect_super_cols ==================================================== */
+/* ========================================================================== */
+
+/*
+    Detects supercolumns by finding matches between columns in the hash buckets.
+    Check amongst columns in the set A [row_start ... row_start + row_length-1].
+    The columns under consideration are currently *not* in the degree lists,
+    and have already been placed in the hash buckets.
+
+    The hash bucket for columns whose hash function is equal to h is stored
+    as follows:
+
+	if head [h] is >= 0, then head [h] contains a degree list, so:
+
+		head [h] is the first column in degree bucket h.
+		Col [head [h]].headhash gives the first column in hash bucket h.
+
+	otherwise, the degree list is empty, and:
+
+		-(head [h] + 2) is the first column in hash bucket h.
+
+    For a column c in a hash bucket, Col [c].shared3.prev is NOT a "previous
+    column" pointer.  Col [c].shared3.hash is used instead as the hash number
+    for that column.  The value of Col [c].shared4.hash_next is the next column
+    in the same hash bucket.
+
+    Assuming no, or "few" hash collisions, the time taken by this routine is
+    linear in the sum of the sizes (lengths) of each column whose score has
+    just been computed in the approximate degree computation.
+    Not user-callable.
+*/
+
+PRIVATE void detect_super_cols
+(
+    /* === Parameters ======================================================= */
+
+#ifndef NDEBUG
+    /* these two parameters are only needed when debugging is enabled: */
+    int n_col,			/* number of columns of A */
+    RowInfo Row [],		/* of size n_row+1 */
+#endif
+    ColInfo Col [],		/* of size n_col+1 */
+    int A [],			/* row indices of A */
+    int head [],		/* head of degree lists and hash buckets */
+    int row_start,		/* pointer to set of columns to check */
+    int row_length		/* number of columns to check */
+)
+{
+    /* === Local variables ================================================== */
+
+    int hash ;			/* hash # for a column */
+    int *rp ;			/* pointer to a row */
+    int c ;			/* a column index */
+    int super_c ;		/* column index of the column to absorb into */
+    int *cp1 ;			/* column pointer for column super_c */
+    int *cp2 ;			/* column pointer for column c */
+    int length ;		/* length of column super_c */
+    int prev_c ;		/* column preceding c in hash bucket */
+    int i ;			/* loop counter */
+    int *rp_end ;		/* pointer to the end of the row */
+    int col ;			/* a column index in the row to check */
+    int head_column ;		/* first column in hash bucket or degree list */
+    int first_col ;		/* first column in hash bucket */
+
+    /* === Consider each column in the row ================================== */
+
+    rp = &A [row_start] ;
+    rp_end = rp + row_length ;
+    while (rp < rp_end)
+    {
+	col = *rp++ ;
+	if (COL_IS_DEAD (col))
+	{
+	    continue ;
+	}
+
+	/* get hash number for this column */
+	hash = Col [col].shared3.hash ;
+	assert (hash <= n_col) ;
+
+	/* === Get the first column in this hash bucket ===================== */
+
+	head_column = head [hash] ;
+	if (head_column > EMPTY)
+	{
+	    first_col = Col [head_column].shared3.headhash ;
+	}
+	else
+	{
+	    first_col = - (head_column + 2) ;
+	}
+
+	/* === Consider each column in the hash bucket ====================== */
+
+	for (super_c = first_col ; super_c != EMPTY ;
+	    super_c = Col [super_c].shared4.hash_next)
+	{
+	    assert (COL_IS_ALIVE (super_c)) ;
+	    assert (Col [super_c].shared3.hash == hash) ;
+	    length = Col [super_c].length ;
+
+	    /* prev_c is the column preceding column c in the hash bucket */
+	    prev_c = super_c ;
+
+	    /* === Compare super_c with all columns after it ================ */
+
+	    for (c = Col [super_c].shared4.hash_next ;
+		 c != EMPTY ; c = Col [c].shared4.hash_next)
+	    {
+		assert (c != super_c) ;
+		assert (COL_IS_ALIVE (c)) ;
+		assert (Col [c].shared3.hash == hash) ;
+
+		/* not identical if lengths or scores are different */
+		if (Col [c].length != length ||
+		    Col [c].shared2.score != Col [super_c].shared2.score)
+		{
+		    prev_c = c ;
+		    continue ;
+		}
+
+		/* compare the two columns */
+		cp1 = &A [Col [super_c].start] ;
+		cp2 = &A [Col [c].start] ;
+
+		for (i = 0 ; i < length ; i++)
+		{
+		    /* the columns are "clean" (no dead rows) */
+		    assert (ROW_IS_ALIVE (*cp1))  ;
+		    assert (ROW_IS_ALIVE (*cp2))  ;
+		    /* row indices will same order for both supercols, */
+		    /* no gather scatter nessasary */
+		    if (*cp1++ != *cp2++)
+		    {
+			break ;
+		    }
+		}
+
+		/* the two columns are different if the for-loop "broke" */
+		if (i != length)
+		{
+		    prev_c = c ;
+		    continue ;
+		}
+
+		/* === Got it!  two columns are identical =================== */
+
+		assert (Col [c].shared2.score == Col [super_c].shared2.score) ;
+
+		Col [super_c].shared1.thickness += Col [c].shared1.thickness ;
+		Col [c].shared1.parent = super_c ;
+		KILL_NON_PRINCIPAL_COL (c) ;
+		/* order c later, in order_children() */
+		Col [c].shared2.order = EMPTY ;
+		/* remove c from hash bucket */
+		Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ;
+	    }
+	}
+
+	/* === Empty this hash bucket ======================================= */
+
+	if (head_column > EMPTY)
+	{
+	    /* corresponding degree list "hash" is not empty */
+	    Col [head_column].shared3.headhash = EMPTY ;
+	}
+	else
+	{
+	    /* corresponding degree list "hash" is empty */
+	    head [hash] = EMPTY ;
+	}
+    }
+}
+
+
+/* ========================================================================== */
+/* === garbage_collection =================================================== */
+/* ========================================================================== */
+
+/*
+    Defragments and compacts columns and rows in the workspace A.  Used when
+    all avaliable memory has been used while performing row merging.  Returns
+    the index of the first free position in A, after garbage collection.  The
+    time taken by this routine is linear is the size of the array A, which is
+    itself linear in the number of nonzeros in the input matrix.
+    Not user-callable.
+*/
+
+PRIVATE int garbage_collection  /* returns the new value of pfree */
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,			/* number of rows */
+    int n_col,			/* number of columns */
+    RowInfo Row [],		/* row info */
+    ColInfo Col [],		/* column info */
+    int A [],			/* A [0 ... Alen-1] holds the matrix */
+    int *pfree			/* &A [0] ... pfree is in use */
+)
+{
+    /* === Local variables ================================================== */
+
+    int *psrc ;			/* source pointer */
+    int *pdest ;		/* destination pointer */
+    int j ;			/* counter */
+    int r ;			/* a row index */
+    int c ;			/* a column index */
+    int length ;		/* length of a row or column */
+
+#ifndef NDEBUG
+    int debug_rows ;
+    DEBUG0 (("Defrag..\n")) ;
+    for (psrc = &A[0] ; psrc < pfree ; psrc++) assert (*psrc >= 0) ;
+    debug_rows = 0 ;
+#endif
+
+    /* === Defragment the columns =========================================== */
+
+    pdest = &A[0] ;
+    for (c = 0 ; c < n_col ; c++)
+    {
+	if (COL_IS_ALIVE (c))
+	{
+	    psrc = &A [Col [c].start] ;
+
+	    /* move and compact the column */
+	    assert (pdest <= psrc) ;
+	    Col [c].start = (int) (pdest - &A [0]) ;
+	    length = Col [c].length ;
+	    for (j = 0 ; j < length ; j++)
+	    {
+		r = *psrc++ ;
+		if (ROW_IS_ALIVE (r))
+		{
+		    *pdest++ = r ;
+		}
+	    }
+	    Col [c].length = (int) (pdest - &A [Col [c].start]) ;
+	}
+    }
+
+    /* === Prepare to defragment the rows =================================== */
+
+    for (r = 0 ; r < n_row ; r++)
+    {
+	if (ROW_IS_ALIVE (r))
+	{
+	    if (Row [r].length == 0)
+	    {
+		/* this row is of zero length.  cannot compact it, so kill it */
+		DEBUG0 (("Defrag row kill\n")) ;
+		KILL_ROW (r) ;
+	    }
+	    else
+	    {
+		/* save first column index in Row [r].shared2.first_column */
+		psrc = &A [Row [r].start] ;
+		Row [r].shared2.first_column = *psrc ;
+		assert (ROW_IS_ALIVE (r)) ;
+		/* flag the start of the row with the one's complement of row */
+		*psrc = ONES_COMPLEMENT (r) ;
+#ifndef NDEBUG
+		debug_rows++ ;
+#endif
+	    }
+	}
+    }
+
+    /* === Defragment the rows ============================================== */
+
+    psrc = pdest ;
+    while (psrc < pfree)
+    {
+	/* find a negative number ... the start of a row */
+	if (*psrc++ < 0)
+	{
+	    psrc-- ;
+	    /* get the row index */
+	    r = ONES_COMPLEMENT (*psrc) ;
+	    assert (r >= 0 && r < n_row) ;
+	    /* restore first column index */
+	    *psrc = Row [r].shared2.first_column ;
+	    assert (ROW_IS_ALIVE (r)) ;
+
+	    /* move and compact the row */
+	    assert (pdest <= psrc) ;
+	    Row [r].start = (int) (pdest - &A [0]) ;
+	    length = Row [r].length ;
+	    for (j = 0 ; j < length ; j++)
+	    {
+		c = *psrc++ ;
+		if (COL_IS_ALIVE (c))
+		{
+		    *pdest++ = c ;
+		}
+	    }
+	    Row [r].length = (int) (pdest - &A [Row [r].start]) ;
+#ifndef NDEBUG
+	    debug_rows-- ;
+#endif
+	}
+    }
+    /* ensure we found all the rows */
+    assert (debug_rows == 0) ;
+
+    /* === Return the new value of pfree ==================================== */
+
+    return ((int) (pdest - &A [0])) ;
+}
+
+
+/* ========================================================================== */
+/* === clear_mark =========================================================== */
+/* ========================================================================== */
+
+/*
+    Clears the Row [].shared2.mark array, and returns the new tag_mark.
+    Return value is the new tag_mark.  Not user-callable.
+*/
+
+PRIVATE int clear_mark	/* return the new value for tag_mark */
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,		/* number of rows in A */
+    RowInfo Row []	/* Row [0 ... n_row-1].shared2.mark is set to zero */
+)
+{
+    /* === Local variables ================================================== */
+
+    int r ;
+
+    DEBUG0 (("Clear mark\n")) ;
+    for (r = 0 ; r < n_row ; r++)
+    {
+	if (ROW_IS_ALIVE (r))
+	{
+	    Row [r].shared2.mark = 0 ;
+	}
+    }
+    return (1) ;
+}
+
+
+/* ========================================================================== */
+/* === debugging routines =================================================== */
+/* ========================================================================== */
+
+/* When debugging is disabled, the remainder of this file is ignored. */
+
+#ifndef NDEBUG
+
+
+/* ========================================================================== */
+/* === debug_structures ===================================================== */
+/* ========================================================================== */
+
+/*
+    At this point, all empty rows and columns are dead.  All live columns
+    are "clean" (containing no dead rows) and simplicial (no supercolumns
+    yet).  Rows may contain dead columns, but all live rows contain at
+    least one live column.
+*/
+
+PRIVATE void debug_structures
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,
+    int n_col,
+    RowInfo Row [],
+    ColInfo Col [],
+    int A [],
+    int n_col2
+)
+{
+    /* === Local variables ================================================== */
+
+    int i ;
+    int c ;
+    int *cp ;
+    int *cp_end ;
+    int len ;
+    int score ;
+    int r ;
+    int *rp ;
+    int *rp_end ;
+    int deg ;
+
+    /* === Check A, Row, and Col ============================================ */
+
+    for (c = 0 ; c < n_col ; c++)
+    {
+	if (COL_IS_ALIVE (c))
+	{
+	    len = Col [c].length ;
+	    score = Col [c].shared2.score ;
+	    DEBUG4 (("initial live col %5d %5d %5d\n", c, len, score)) ;
+	    assert (len > 0) ;
+	    assert (score >= 0) ;
+	    assert (Col [c].shared1.thickness == 1) ;
+	    cp = &A [Col [c].start] ;
+	    cp_end = cp + len ;
+	    while (cp < cp_end)
+	    {
+		r = *cp++ ;
+		assert (ROW_IS_ALIVE (r)) ;
+	    }
+	}
+	else
+	{
+	    i = Col [c].shared2.order ;
+	    assert (i >= n_col2 && i < n_col) ;
+	}
+    }
+
+    for (r = 0 ; r < n_row ; r++)
+    {
+	if (ROW_IS_ALIVE (r))
+	{
+	    i = 0 ;
+	    len = Row [r].length ;
+	    deg = Row [r].shared1.degree ;
+	    assert (len > 0) ;
+	    assert (deg > 0) ;
+	    rp = &A [Row [r].start] ;
+	    rp_end = rp + len ;
+	    while (rp < rp_end)
+	    {
+		c = *rp++ ;
+		if (COL_IS_ALIVE (c))
+		{
+		    i++ ;
+		}
+	    }
+	    assert (i > 0) ;
+	}
+    }
+}
+
+
+/* ========================================================================== */
+/* === debug_deg_lists ====================================================== */
+/* ========================================================================== */
+
+/*
+    Prints the contents of the degree lists.  Counts the number of columns
+    in the degree list and compares it to the total it should have.  Also
+    checks the row degrees.
+*/
+
+PRIVATE void debug_deg_lists
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,
+    int n_col,
+    RowInfo Row [],
+    ColInfo Col [],
+    int head [],
+    int min_score,
+    int should,
+    int max_deg
+)
+{
+    /* === Local variables ================================================== */
+
+    int deg ;
+    int col ;
+    int have ;
+    int row ;
+
+    /* === Check the degree lists =========================================== */
+
+    if (n_col > 10000 && debug_colamd <= 0)
+    {
+	return ;
+    }
+    have = 0 ;
+    DEBUG4 (("Degree lists: %d\n", min_score)) ;
+    for (deg = 0 ; deg <= n_col ; deg++)
+    {
+	col = head [deg] ;
+	if (col == EMPTY)
+	{
+	    continue ;
+	}
+	DEBUG4 (("%d:", deg)) ;
+	while (col != EMPTY)
+	{
+	    DEBUG4 ((" %d", col)) ;
+	    have += Col [col].shared1.thickness ;
+	    assert (COL_IS_ALIVE (col)) ;
+	    col = Col [col].shared4.degree_next ;
+	}
+	DEBUG4 (("\n")) ;
+    }
+    DEBUG4 (("should %d have %d\n", should, have)) ;
+    assert (should == have) ;
+
+    /* === Check the row degrees ============================================ */
+
+    if (n_row > 10000 && debug_colamd <= 0)
+    {
+	return ;
+    }
+    for (row = 0 ; row < n_row ; row++)
+    {
+	if (ROW_IS_ALIVE (row))
+	{
+	    assert (Row [row].shared1.degree <= max_deg) ;
+	}
+    }
+}
+
+
+/* ========================================================================== */
+/* === debug_mark =========================================================== */
+/* ========================================================================== */
+
+/*
+    Ensures that the tag_mark is less that the maximum and also ensures that
+    each entry in the mark array is less than the tag mark.
+*/
+
+PRIVATE void debug_mark
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,
+    RowInfo Row [],
+    int tag_mark,
+    int max_mark
+)
+{
+    /* === Local variables ================================================== */
+
+    int r ;
+
+    /* === Check the Row marks ============================================== */
+
+    assert (tag_mark > 0 && tag_mark <= max_mark) ;
+    if (n_row > 10000 && debug_colamd <= 0)
+    {
+	return ;
+    }
+    for (r = 0 ; r < n_row ; r++)
+    {
+	assert (Row [r].shared2.mark < tag_mark) ;
+    }
+}
+
+
+/* ========================================================================== */
+/* === debug_matrix ========================================================= */
+/* ========================================================================== */
+
+/*
+    Prints out the contents of the columns and the rows.
+*/
+
+PRIVATE void debug_matrix
+(
+    /* === Parameters ======================================================= */
+
+    int n_row,
+    int n_col,
+    RowInfo Row [],
+    ColInfo Col [],
+    int A []
+)
+{
+    /* === Local variables ================================================== */
+
+    int r ;
+    int c ;
+    int *rp ;
+    int *rp_end ;
+    int *cp ;
+    int *cp_end ;
+
+    /* === Dump the rows and columns of the matrix ========================== */
+
+    if (debug_colamd < 3)
+    {
+	return ;
+    }
+    DEBUG3 (("DUMP MATRIX:\n")) ;
+    for (r = 0 ; r < n_row ; r++)
+    {
+	DEBUG3 (("Row %d alive? %d\n", r, ROW_IS_ALIVE (r))) ;
+	if (ROW_IS_DEAD (r))
+	{
+	    continue ;
+	}
+	DEBUG3 (("start %d length %d degree %d\n",
+		Row [r].start, Row [r].length, Row [r].shared1.degree)) ;
+	rp = &A [Row [r].start] ;
+	rp_end = rp + Row [r].length ;
+	while (rp < rp_end)
+	{
+	    c = *rp++ ;
+	    DEBUG3 (("	%d col %d\n", COL_IS_ALIVE (c), c)) ;
+	}
+    }
+
+    for (c = 0 ; c < n_col ; c++)
+    {
+	DEBUG3 (("Col %d alive? %d\n", c, COL_IS_ALIVE (c))) ;
+	if (COL_IS_DEAD (c))
+	{
+	    continue ;
+	}
+	DEBUG3 (("start %d length %d shared1 %d shared2 %d\n",
+		Col [c].start, Col [c].length,
+		Col [c].shared1.thickness, Col [c].shared2.score)) ;
+	cp = &A [Col [c].start] ;
+	cp_end = cp + Col [c].length ;
+	while (cp < cp_end)
+	{
+	    r = *cp++ ;
+	    DEBUG3 (("	%d row %d\n", ROW_IS_ALIVE (r), r)) ;
+	}
+    }
+}
+
+#endif
+
diff --git a/intern/opennl/superlu/colamd.h b/intern/opennl/superlu/colamd.h
new file mode 100644
index 00000000000..00783983b27
--- /dev/null
+++ b/intern/opennl/superlu/colamd.h
@@ -0,0 +1,67 @@
+/* ========================================================================== */
+/* === colamd prototypes and definitions ==================================== */
+/* ========================================================================== */
+
+/*
+    This is the colamd include file,
+
+	http://www.cise.ufl.edu/~davis/colamd/colamd.h
+
+    for use in the colamd.c, colamdmex.c, and symamdmex.c files located at
+
+	http://www.cise.ufl.edu/~davis/colamd/
+
+    See those files for a description of colamd and symamd, and for the
+    copyright notice, which also applies to this file.
+
+    August 3, 1998.  Version 1.0.
+*/
+
+/* ========================================================================== */
+/* === Definitions ========================================================== */
+/* ========================================================================== */
+
+/* size of the knobs [ ] array.  Only knobs [0..1] are currently used. */
+#define COLAMD_KNOBS 20
+
+/* number of output statistics.  Only A [0..2] are currently used. */
+#define COLAMD_STATS 20
+
+/* knobs [0] and A [0]: dense row knob and output statistic. */
+#define COLAMD_DENSE_ROW 0
+
+/* knobs [1] and A [1]: dense column knob and output statistic. */
+#define COLAMD_DENSE_COL 1
+
+/* A [2]: memory defragmentation count output statistic */
+#define COLAMD_DEFRAG_COUNT 2
+
+/* A [3]: whether or not the input columns were jumbled or had duplicates */
+#define COLAMD_JUMBLED_COLS 3
+
+/* ========================================================================== */
+/* === Prototypes of user-callable routines ================================= */
+/* ========================================================================== */
+
+int colamd_recommended		/* returns recommended value of Alen */
+(
+    int nnz,			/* nonzeros in A */
+    int n_row,			/* number of rows in A */
+    int n_col			/* number of columns in A */
+) ;
+
+void colamd_set_defaults	/* sets default parameters */
+(				/* knobs argument is modified on output */
+    double knobs [COLAMD_KNOBS]	/* parameter settings for colamd */
+) ;
+
+int colamd			/* returns TRUE if successful, FALSE otherwise*/
+(				/* A and p arguments are modified on output */
+    int n_row,			/* number of rows in A */
+    int n_col,			/* number of columns in A */
+    int Alen,			/* size of the array A */
+    int A [],			/* row indices of A, of size Alen */
+    int p [],			/* column pointers of A, of size n_col+1 */
+    double knobs [COLAMD_KNOBS]	/* parameter settings for colamd */
+) ;
+
diff --git a/intern/opennl/superlu/get_perm_c.c b/intern/opennl/superlu/get_perm_c.c
new file mode 100644
index 00000000000..9cdf5a876bf
--- /dev/null
+++ b/intern/opennl/superlu/get_perm_c.c
@@ -0,0 +1,453 @@
+/*
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ */
+
+#include "ssp_defs.h"
+#include "colamd.h"
+
+extern int  genmmd_(int *, int *, int *, int *, int *, int *, int *, 
+		    int *, int *, int *, int *, int *);
+
+void
+get_colamd(
+	   const int m,  /* number of rows in matrix A. */
+	   const int n,  /* number of columns in matrix A. */
+	   const int nnz,/* number of nonzeros in matrix A. */
+	   int *colptr,  /* column pointer of size n+1 for matrix A. */
+	   int *rowind,  /* row indices of size nz for matrix A. */
+	   int *perm_c   /* out - the column permutation vector. */
+	   )
+{
+    int Alen, *A, i, info, *p;
+    double *knobs;
+
+    Alen = colamd_recommended(nnz, m, n);
+
+    if ( !(knobs = (double *) SUPERLU_MALLOC(COLAMD_KNOBS * sizeof(double))) )
+        ABORT("Malloc fails for knobs");
+    colamd_set_defaults(knobs);
+
+    if (!(A = (int *) SUPERLU_MALLOC(Alen * sizeof(int))) )
+        ABORT("Malloc fails for A[]");
+    if (!(p = (int *) SUPERLU_MALLOC((n+1) * sizeof(int))) )
+        ABORT("Malloc fails for p[]");
+    for (i = 0; i <= n; ++i) p[i] = colptr[i];
+    for (i = 0; i < nnz; ++i) A[i] = rowind[i];
+    info = colamd(m, n, Alen, A, p, knobs);
+    if ( info == FALSE ) ABORT("COLAMD failed");
+
+    for (i = 0; i < n; ++i) perm_c[p[i]] = i;
+
+    SUPERLU_FREE(knobs);
+    SUPERLU_FREE(A);
+    SUPERLU_FREE(p);
+}
+
+void
+getata(
+       const int m,      /* number of rows in matrix A. */
+       const int n,      /* number of columns in matrix A. */
+       const int nz,     /* number of nonzeros in matrix A */
+       int *colptr,      /* column pointer of size n+1 for matrix A. */
+       int *rowind,      /* row indices of size nz for matrix A. */
+       int *atanz,       /* out - on exit, returns the actual number of
+                            nonzeros in matrix A'*A. */
+       int **ata_colptr, /* out - size n+1 */
+       int **ata_rowind  /* out - size *atanz */
+       )
+/*
+ * Purpose
+ * =======
+ *
+ * Form the structure of A'*A. A is an m-by-n matrix in column oriented
+ * format represented by (colptr, rowind). The output A'*A is in column
+ * oriented format (symmetrically, also row oriented), represented by
+ * (ata_colptr, ata_rowind).
+ *
+ * This routine is modified from GETATA routine by Tim Davis.
+ * The complexity of this algorithm is: SUM_{i=1,m} r(i)^2,
+ * i.e., the sum of the square of the row counts.
+ *
+ * Questions
+ * =========
+ *     o  Do I need to withhold the *dense* rows?
+ *     o  How do I know the number of nonzeros in A'*A?
+ * 
+ */
+{
+    register int i, j, k, col, num_nz, ti, trow;
+    int *marker, *b_colptr, *b_rowind;
+    int *t_colptr, *t_rowind; /* a column oriented form of T = A' */
+
+    if ( !(marker = (int*) SUPERLU_MALLOC((SUPERLU_MAX(m,n)+1)*sizeof(int))) )
+	ABORT("SUPERLU_MALLOC fails for marker[]");
+    if ( !(t_colptr = (int*) SUPERLU_MALLOC((m+1) * sizeof(int))) )
+	ABORT("SUPERLU_MALLOC t_colptr[]");
+    if ( !(t_rowind = (int*) SUPERLU_MALLOC(nz * sizeof(int))) )
+	ABORT("SUPERLU_MALLOC fails for t_rowind[]");
+
+    
+    /* Get counts of each column of T, and set up column pointers */
+    for (i = 0; i < m; ++i) marker[i] = 0;
+    for (j = 0; j < n; ++j) {
+	for (i = colptr[j]; i < colptr[j+1]; ++i)
+	    ++marker[rowind[i]];
+    }
+    t_colptr[0] = 0;
+    for (i = 0; i < m; ++i) {
+	t_colptr[i+1] = t_colptr[i] + marker[i];
+	marker[i] = t_colptr[i];
+    }
+
+    /* Transpose the matrix from A to T */
+    for (j = 0; j < n; ++j)
+	for (i = colptr[j]; i < colptr[j+1]; ++i) {
+	    col = rowind[i];
+	    t_rowind[marker[col]] = j;
+	    ++marker[col];
+	}
+
+    
+    /* ----------------------------------------------------------------
+       compute B = T * A, where column j of B is:
+
+       Struct (B_*j) =    UNION   ( Struct (T_*k) )
+                        A_kj != 0
+
+       do not include the diagonal entry
+   
+       ( Partition A as: A = (A_*1, ..., A_*n)
+         Then B = T * A = (T * A_*1, ..., T * A_*n), where
+         T * A_*j = (T_*1, ..., T_*m) * A_*j.  )
+       ---------------------------------------------------------------- */
+
+    /* Zero the diagonal flag */
+    for (i = 0; i < n; ++i) marker[i] = -1;
+
+    /* First pass determines number of nonzeros in B */
+    num_nz = 0;
+    for (j = 0; j < n; ++j) {
+	/* Flag the diagonal so it's not included in the B matrix */
+	marker[j] = j;
+
+	for (i = colptr[j]; i < colptr[j+1]; ++i) {
+	    /* A_kj is nonzero, add pattern of column T_*k to B_*j */
+	    k = rowind[i];
+	    for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) {
+		trow = t_rowind[ti];
+		if ( marker[trow] != j ) {
+		    marker[trow] = j;
+		    num_nz++;
+		}
+	    }
+	}
+    }
+    *atanz = num_nz;
+    
+    /* Allocate storage for A'*A */
+    if ( !(*ata_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) )
+	ABORT("SUPERLU_MALLOC fails for ata_colptr[]");
+    if ( *atanz ) {
+	if ( !(*ata_rowind = (int*) SUPERLU_MALLOC( *atanz * sizeof(int)) ) )
+	    ABORT("SUPERLU_MALLOC fails for ata_rowind[]");
+    }
+    b_colptr = *ata_colptr; /* aliasing */
+    b_rowind = *ata_rowind;
+    
+    /* Zero the diagonal flag */
+    for (i = 0; i < n; ++i) marker[i] = -1;
+    
+    /* Compute each column of B, one at a time */
+    num_nz = 0;
+    for (j = 0; j < n; ++j) {
+	b_colptr[j] = num_nz;
+	
+	/* Flag the diagonal so it's not included in the B matrix */
+	marker[j] = j;
+
+	for (i = colptr[j]; i < colptr[j+1]; ++i) {
+	    /* A_kj is nonzero, add pattern of column T_*k to B_*j */
+	    k = rowind[i];
+	    for (ti = t_colptr[k]; ti < t_colptr[k+1]; ++ti) {
+		trow = t_rowind[ti];
+		if ( marker[trow] != j ) {
+		    marker[trow] = j;
+		    b_rowind[num_nz++] = trow;
+		}
+	    }
+	}
+    }
+    b_colptr[n] = num_nz;
+       
+    SUPERLU_FREE(marker);
+    SUPERLU_FREE(t_colptr);
+    SUPERLU_FREE(t_rowind);
+}
+
+
+void
+at_plus_a(
+	  const int n,      /* number of columns in matrix A. */
+	  const int nz,     /* number of nonzeros in matrix A */
+	  int *colptr,      /* column pointer of size n+1 for matrix A. */
+	  int *rowind,      /* row indices of size nz for matrix A. */
+	  int *bnz,         /* out - on exit, returns the actual number of
+                               nonzeros in matrix A'*A. */
+	  int **b_colptr,   /* out - size n+1 */
+	  int **b_rowind    /* out - size *bnz */
+	  )
+{
+/*
+ * Purpose
+ * =======
+ *
+ * Form the structure of A'+A. A is an n-by-n matrix in column oriented
+ * format represented by (colptr, rowind). The output A'+A is in column
+ * oriented format (symmetrically, also row oriented), represented by
+ * (b_colptr, b_rowind).
+ *
+ */
+    register int i, j, k, col, num_nz;
+    int *t_colptr, *t_rowind; /* a column oriented form of T = A' */
+    int *marker;
+
+    if ( !(marker = (int*) SUPERLU_MALLOC( n * sizeof(int)) ) )
+	ABORT("SUPERLU_MALLOC fails for marker[]");
+    if ( !(t_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) )
+	ABORT("SUPERLU_MALLOC fails for t_colptr[]");
+    if ( !(t_rowind = (int*) SUPERLU_MALLOC( nz * sizeof(int)) ) )
+	ABORT("SUPERLU_MALLOC fails t_rowind[]");
+
+    
+    /* Get counts of each column of T, and set up column pointers */
+    for (i = 0; i < n; ++i) marker[i] = 0;
+    for (j = 0; j < n; ++j) {
+	for (i = colptr[j]; i < colptr[j+1]; ++i)
+	    ++marker[rowind[i]];
+    }
+    t_colptr[0] = 0;
+    for (i = 0; i < n; ++i) {
+	t_colptr[i+1] = t_colptr[i] + marker[i];
+	marker[i] = t_colptr[i];
+    }
+
+    /* Transpose the matrix from A to T */
+    for (j = 0; j < n; ++j)
+	for (i = colptr[j]; i < colptr[j+1]; ++i) {
+	    col = rowind[i];
+	    t_rowind[marker[col]] = j;
+	    ++marker[col];
+	}
+
+
+    /* ----------------------------------------------------------------
+       compute B = A + T, where column j of B is:
+
+       Struct (B_*j) = Struct (A_*k) UNION Struct (T_*k)
+
+       do not include the diagonal entry
+       ---------------------------------------------------------------- */
+
+    /* Zero the diagonal flag */
+    for (i = 0; i < n; ++i) marker[i] = -1;
+
+    /* First pass determines number of nonzeros in B */
+    num_nz = 0;
+    for (j = 0; j < n; ++j) {
+	/* Flag the diagonal so it's not included in the B matrix */
+	marker[j] = j;
+
+	/* Add pattern of column A_*k to B_*j */
+	for (i = colptr[j]; i < colptr[j+1]; ++i) {
+	    k = rowind[i];
+	    if ( marker[k] != j ) {
+		marker[k] = j;
+		++num_nz;
+	    }
+	}
+
+	/* Add pattern of column T_*k to B_*j */
+	for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) {
+	    k = t_rowind[i];
+	    if ( marker[k] != j ) {
+		marker[k] = j;
+		++num_nz;
+	    }
+	}
+    }
+    *bnz = num_nz;
+    
+    /* Allocate storage for A+A' */
+    if ( !(*b_colptr = (int*) SUPERLU_MALLOC( (n+1) * sizeof(int)) ) )
+	ABORT("SUPERLU_MALLOC fails for b_colptr[]");
+    if ( *bnz) {
+      if ( !(*b_rowind = (int*) SUPERLU_MALLOC( *bnz * sizeof(int)) ) )
+	ABORT("SUPERLU_MALLOC fails for b_rowind[]");
+    }
+    
+    /* Zero the diagonal flag */
+    for (i = 0; i < n; ++i) marker[i] = -1;
+    
+    /* Compute each column of B, one at a time */
+    num_nz = 0;
+    for (j = 0; j < n; ++j) {
+	(*b_colptr)[j] = num_nz;
+	
+	/* Flag the diagonal so it's not included in the B matrix */
+	marker[j] = j;
+
+	/* Add pattern of column A_*k to B_*j */
+	for (i = colptr[j]; i < colptr[j+1]; ++i) {
+	    k = rowind[i];
+	    if ( marker[k] != j ) {
+		marker[k] = j;
+		(*b_rowind)[num_nz++] = k;
+	    }
+	}
+
+	/* Add pattern of column T_*k to B_*j */
+	for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) {
+	    k = t_rowind[i];
+	    if ( marker[k] != j ) {
+		marker[k] = j;
+		(*b_rowind)[num_nz++] = k;
+	    }
+	}
+    }
+    (*b_colptr)[n] = num_nz;
+       
+    SUPERLU_FREE(marker);
+    SUPERLU_FREE(t_colptr);
+    SUPERLU_FREE(t_rowind);
+}
+
+void
+get_perm_c(int ispec, SuperMatrix *A, int *perm_c)
+/*
+ * Purpose
+ * =======
+ *
+ * GET_PERM_C obtains a permutation matrix Pc, by applying the multiple
+ * minimum degree ordering code by Joseph Liu to matrix A'*A or A+A'.
+ * or using approximate minimum degree column ordering by Davis et. al.
+ * The LU factorization of A*Pc tends to have less fill than the LU 
+ * factorization of A.
+ *
+ * Arguments
+ * =========
+ *
+ * ispec   (input) int
+ *         Specifies the type of column ordering to reduce fill:
+ *         = 1: minimum degree on the structure of A^T * A
+ *         = 2: minimum degree on the structure of A^T + A
+ *         = 3: approximate minimum degree for unsymmetric matrices
+ *         If ispec == 0, the natural ordering (i.e., Pc = I) is returned.
+ * 
+ * A       (input) SuperMatrix*
+ *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
+ *         of the linear equations is A->nrow. Currently, the type of A 
+ *         can be: Stype = NC; Dtype = _D; Mtype = GE. In the future,
+ *         more general A can be handled.
+ *
+ * perm_c  (output) int*
+ *	   Column permutation vector of size A->ncol, which defines the 
+ *         permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *         in position j in A*Pc.
+ *
+ */
+{
+    NCformat *Astore = A->Store;
+    int m, n, bnz, *b_colptr, i;
+    int delta, maxint, nofsub, *invp;
+    int *b_rowind, *dhead, *qsize, *llist, *marker;
+    double t, SuperLU_timer_();
+    
+    m = A->nrow;
+    n = A->ncol;
+
+    t = SuperLU_timer_();
+    switch ( ispec ) {
+        case 0: /* Natural ordering */
+	      for (i = 0; i < n; ++i) perm_c[i] = i;
+#if ( PRNTlevel>=1 )
+	      printf("Use natural column ordering.\n");
+#endif
+	      return;
+        case 1: /* Minimum degree ordering on A'*A */
+	      getata(m, n, Astore->nnz, Astore->colptr, Astore->rowind,
+		     &bnz, &b_colptr, &b_rowind);
+#if ( PRNTlevel>=1 )
+	      printf("Use minimum degree ordering on A'*A.\n");
+#endif
+	      t = SuperLU_timer_() - t;
+	      /*printf("Form A'*A time = %8.3f\n", t);*/
+	      break;
+        case 2: /* Minimum degree ordering on A'+A */
+	      if ( m != n ) ABORT("Matrix is not square");
+	      at_plus_a(n, Astore->nnz, Astore->colptr, Astore->rowind,
+			&bnz, &b_colptr, &b_rowind);
+#if ( PRNTlevel>=1 )
+	      printf("Use minimum degree ordering on A'+A.\n");
+#endif
+	      t = SuperLU_timer_() - t;
+	      /*printf("Form A'+A time = %8.3f\n", t);*/
+	      break;
+        case 3: /* Approximate minimum degree column ordering. */
+	      get_colamd(m, n, Astore->nnz, Astore->colptr, Astore->rowind,
+			 perm_c);
+#if ( PRNTlevel>=1 )
+	      printf(".. Use approximate minimum degree column ordering.\n");
+#endif
+	      return; 
+        default:
+	      ABORT("Invalid ISPEC");
+    }
+
+    if ( bnz != 0 ) {
+	t = SuperLU_timer_();
+
+	/* Initialize and allocate storage for GENMMD. */
+	delta = 1; /* DELTA is a parameter to allow the choice of nodes
+		      whose degree <= min-degree + DELTA. */
+	maxint = 2147483647; /* 2**31 - 1 */
+	invp = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int));
+	if ( !invp ) ABORT("SUPERLU_MALLOC fails for invp.");
+	dhead = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int));
+	if ( !dhead ) ABORT("SUPERLU_MALLOC fails for dhead.");
+	qsize = (int *) SUPERLU_MALLOC((n+delta)*sizeof(int));
+	if ( !qsize ) ABORT("SUPERLU_MALLOC fails for qsize.");
+	llist = (int *) SUPERLU_MALLOC(n*sizeof(int));
+	if ( !llist ) ABORT("SUPERLU_MALLOC fails for llist.");
+	marker = (int *) SUPERLU_MALLOC(n*sizeof(int));
+	if ( !marker ) ABORT("SUPERLU_MALLOC fails for marker.");
+
+	/* Transform adjacency list into 1-based indexing required by GENMMD.*/
+	for (i = 0; i <= n; ++i) ++b_colptr[i];
+	for (i = 0; i < bnz; ++i) ++b_rowind[i];
+	
+	genmmd_(&n, b_colptr, b_rowind, perm_c, invp, &delta, dhead, 
+		qsize, llist, marker, &maxint, &nofsub);
+
+	/* Transform perm_c into 0-based indexing. */
+	for (i = 0; i < n; ++i) --perm_c[i];
+
+	SUPERLU_FREE(b_colptr);
+	SUPERLU_FREE(b_rowind);
+	SUPERLU_FREE(invp);
+	SUPERLU_FREE(dhead);
+	SUPERLU_FREE(qsize);
+	SUPERLU_FREE(llist);
+	SUPERLU_FREE(marker);
+
+	t = SuperLU_timer_() - t;
+	/*  printf("call GENMMD time = %8.3f\n", t);*/
+
+    } else { /* Empty adjacency structure */
+	for (i = 0; i < n; ++i) perm_c[i] = i;
+    }
+
+}
diff --git a/intern/opennl/superlu/heap_relax_snode.c b/intern/opennl/superlu/heap_relax_snode.c
new file mode 100644
index 00000000000..86971f59571
--- /dev/null
+++ b/intern/opennl/superlu/heap_relax_snode.c
@@ -0,0 +1,116 @@
+/*
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ */
+/*
+  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ 
+  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ 
+  Permission is hereby granted to use or copy this program for any
+  purpose, provided the above notices are retained on all copies.
+  Permission to modify the code and to distribute modified code is
+  granted, provided the above notices are retained, and a notice that
+  the code was modified is included with the above copyright notice.
+*/
+
+#include "ssp_defs.h"
+
+void
+heap_relax_snode (
+	     const     int n,
+	     int       *et,           /* column elimination tree */
+	     const int relax_columns, /* max no of columns allowed in a
+					 relaxed snode */
+	     int       *descendants,  /* no of descendants of each node
+					 in the etree */
+	     int       *relax_end     /* last column in a supernode */
+	     )
+{
+/*
+ * Purpose
+ * =======
+ *    relax_snode() - Identify the initial relaxed supernodes, assuming that 
+ *    the matrix has been reordered according to the postorder of the etree.
+ *
+ */ 
+    register int i, j, k, l, parent;
+    register int snode_start;	/* beginning of a snode */
+    int *et_save, *post, *inv_post, *iwork;
+    int nsuper_et = 0, nsuper_et_post = 0;
+
+    /* The etree may not be postordered, but is heap ordered. */
+
+    iwork = (int*) intMalloc(3*n+2); 
+    if ( !iwork ) ABORT("SUPERLU_MALLOC fails for iwork[]");
+    inv_post = iwork + n+1;
+    et_save = inv_post + n+1;
+
+    /* Post order etree */
+    post = (int *) TreePostorder(n, et);
+    for (i = 0; i < n+1; ++i) inv_post[post[i]] = i;
+
+    /* Renumber etree in postorder */
+    for (i = 0; i < n; ++i) {
+        iwork[post[i]] = post[et[i]];
+	et_save[i] = et[i]; /* Save the original etree */
+    }
+    for (i = 0; i < n; ++i) et[i] = iwork[i];
+
+    /* Compute the number of descendants of each node in the etree */
+    ifill (relax_end, n, EMPTY);
+    for (j = 0; j < n; j++) descendants[j] = 0;
+    for (j = 0; j < n; j++) {
+	parent = et[j];
+	if ( parent != n )  /* not the dummy root */
+	    descendants[parent] += descendants[j] + 1;
+    }
+
+    /* Identify the relaxed supernodes by postorder traversal of the etree. */
+    for (j = 0; j < n; ) { 
+     	parent = et[j];
+        snode_start = j;
+ 	while ( parent != n && descendants[parent] < relax_columns ) {
+	    j = parent;
+	    parent = et[j];
+	}
+	/* Found a supernode in postordered etree; j is the last column. */
+	++nsuper_et_post;
+	k = n;
+	for (i = snode_start; i <= j; ++i)
+	    k = SUPERLU_MIN(k, inv_post[i]);
+	l = inv_post[j];
+	if ( (l - k) == (j - snode_start) ) {
+	    /* It's also a supernode in the original etree */
+	    relax_end[k] = l;		/* Last column is recorded */
+	    ++nsuper_et;
+	} else {
+	    for (i = snode_start; i <= j; ++i) {
+	        l = inv_post[i];
+	        if ( descendants[i] == 0 ) relax_end[l] = l;
+	    }
+	}
+	j++;
+	/* Search for a new leaf */
+	while ( descendants[j] != 0 && j < n ) j++;
+    }
+
+#if ( PRNTlevel>=1 )
+    printf(".. heap_snode_relax:\n"
+	   "\tNo of relaxed snodes in postordered etree:\t%d\n"
+	   "\tNo of relaxed snodes in original etree:\t%d\n",
+	   nsuper_et_post, nsuper_et);
+#endif
+
+    /* Recover the original etree */
+    for (i = 0; i < n; ++i) et[i] = et_save[i];
+
+    SUPERLU_FREE(post);
+    SUPERLU_FREE(iwork);
+}
+
+
diff --git a/intern/opennl/superlu/lsame.c b/intern/opennl/superlu/lsame.c
new file mode 100644
index 00000000000..29f27d38fa9
--- /dev/null
+++ b/intern/opennl/superlu/lsame.c
@@ -0,0 +1,70 @@
+int lsame_(char *ca, char *cb)
+{
+/*  -- LAPACK auxiliary routine (version 2.0) --   
+       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
+       Courant Institute, Argonne National Lab, and Rice University   
+       September 30, 1994   
+
+    Purpose   
+    =======   
+
+    LSAME returns .TRUE. if CA is the same letter as CB regardless of case.   
+
+    Arguments   
+    =========   
+
+    CA      (input) CHARACTER*1   
+    CB      (input) CHARACTER*1   
+            CA and CB specify the single characters to be compared.   
+
+   ===================================================================== 
+*/  
+
+    /* System generated locals */
+    int ret_val;
+    
+    /* Local variables */
+    int inta, intb, zcode;
+
+    ret_val = *(unsigned char *)ca == *(unsigned char *)cb;
+    if (ret_val) {
+	return ret_val;
+    }
+
+    /* Now test for equivalence if both characters are alphabetic. */
+
+    zcode = 'Z';
+
+    /* Use 'Z' rather than 'A' so that ASCII can be detected on Prime   
+       machines, on which ICHAR returns a value with bit 8 set.   
+       ICHAR('A') on Prime machines returns 193 which is the same as   
+       ICHAR('A') on an EBCDIC machine. */
+
+    inta = *(unsigned char *)ca;
+    intb = *(unsigned char *)cb;
+
+    if (zcode == 90 || zcode == 122) {
+	/* ASCII is assumed - ZCODE is the ASCII code of either lower or   
+          upper case 'Z'. */
+	if (inta >= 97 && inta <= 122) inta += -32;
+	if (intb >= 97 && intb <= 122) intb += -32;
+
+    } else if (zcode == 233 || zcode == 169) {
+	/* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or   
+          upper case 'Z'. */
+	if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta 
+		>= 162 && inta <= 169))
+	    inta += 64;
+	if ((intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb 
+		>= 162 && intb <= 169))
+	    intb += 64;
+    } else if (zcode == 218 || zcode == 250) {
+	/* ASCII is assumed, on Prime machines - ZCODE is the ASCII code   
+          plus 128 of either lower or upper case 'Z'. */
+	if (inta >= 225 && inta <= 250) inta += -32;
+	if (intb >= 225 && intb <= 250) intb += -32;
+    }
+    ret_val = inta == intb;
+    return ret_val;
+    
+} /* lsame_ */
diff --git a/intern/opennl/superlu/memory.c b/intern/opennl/superlu/memory.c
new file mode 100644
index 00000000000..54d863ea9e9
--- /dev/null
+++ b/intern/opennl/superlu/memory.c
@@ -0,0 +1,207 @@
+/*
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ */
+/** Precision-independent memory-related routines.
+    (Shared by [sdcz]memory.c) **/
+
+#include "ssp_defs.h"
+
+
+#if ( DEBUGlevel>=1 )           /* Debug malloc/free. */
+int superlu_malloc_total = 0;
+
+#define PAD_FACTOR  2
+#define DWORD  (sizeof(double)) /* Be sure it's no smaller than double. */
+
+void *superlu_malloc(size_t size)
+{
+    char *buf;
+
+    buf = (char *) malloc(size + DWORD);
+    if ( !buf ) {
+	printf("superlu_malloc fails: malloc_total %.0f MB, size %d\n",
+	       superlu_malloc_total*1e-6, size);
+	ABORT("superlu_malloc: out of memory");
+    }
+
+    ((int_t *) buf)[0] = size;
+#if 0
+    superlu_malloc_total += size + DWORD;
+#else
+    superlu_malloc_total += size;
+#endif
+    return (void *) (buf + DWORD);
+}
+
+void superlu_free(void *addr)
+{
+    char *p = ((char *) addr) - DWORD;
+
+    if ( !addr )
+	ABORT("superlu_free: tried to free NULL pointer");
+
+    if ( !p )
+	ABORT("superlu_free: tried to free NULL+DWORD pointer");
+
+    { 
+	int_t n = ((int_t *) p)[0];
+	
+	if ( !n )
+	    ABORT("superlu_free: tried to free a freed pointer");
+	*((int_t *) p) = 0; /* Set to zero to detect duplicate free's. */
+#if 0	
+	superlu_malloc_total -= (n + DWORD);
+#else
+	superlu_malloc_total -= n;
+#endif
+
+	if ( superlu_malloc_total < 0 )
+	    ABORT("superlu_malloc_total went negative!");
+	
+	/*free (addr);*/
+	free (p);
+    }
+
+}
+
+#else   /* production mode */
+
+void *superlu_malloc(size_t size)
+{
+    void *buf;
+    buf = (void *) malloc(size);
+    return (buf);
+}
+
+void superlu_free(void *addr)
+{
+    free (addr);
+}
+
+#endif
+
+
+/*
+ * Set up pointers for integer working arrays.
+ */
+void
+SetIWork(int m, int n, int panel_size, int *iworkptr, int **segrep,
+	 int **parent, int **xplore, int **repfnz, int **panel_lsub,
+	 int **xprune, int **marker)
+{
+    *segrep = iworkptr;
+    *parent = iworkptr + m;
+    *xplore = *parent + m;
+    *repfnz = *xplore + m;
+    *panel_lsub = *repfnz + panel_size * m;
+    *xprune = *panel_lsub + panel_size * m;
+    *marker = *xprune + n;
+    ifill (*repfnz, m * panel_size, EMPTY);
+    ifill (*panel_lsub, m * panel_size, EMPTY);
+}
+
+
+void
+copy_mem_int(int howmany, void *old, void *new)
+{
+    register int i;
+    int *iold = old;
+    int *inew = new;
+    for (i = 0; i < howmany; i++) inew[i] = iold[i];
+}
+
+
+void
+user_bcopy(char *src, char *dest, int bytes)
+{
+    char *s_ptr, *d_ptr;
+
+    s_ptr = src + bytes - 1;
+    d_ptr = dest + bytes - 1;
+    for (; d_ptr >= dest; --s_ptr, --d_ptr ) *d_ptr = *s_ptr;
+}
+
+
+
+int *intMalloc(int n)
+{
+    int *buf;
+    buf = (int *) SUPERLU_MALLOC(n * sizeof(int));
+    if ( !buf ) {
+	ABORT("SUPERLU_MALLOC fails for buf in intMalloc()");
+    }
+    return (buf);
+}
+
+int *intCalloc(int n)
+{
+    int *buf;
+    register int i;
+    buf = (int *) SUPERLU_MALLOC(n * sizeof(int));
+    if ( !buf ) {
+	ABORT("SUPERLU_MALLOC fails for buf in intCalloc()");
+    }
+    for (i = 0; i < n; ++i) buf[i] = 0;
+    return (buf);
+}
+
+
+
+#if 0
+check_expanders()
+{
+    int p;
+    printf("Check expanders:\n");
+    for (p = 0; p < NO_MEMTYPE; p++) {
+	printf("type %d, size %d, mem %d\n",
+	       p, expanders[p].size, (int)expanders[p].mem);
+    }
+
+    return 0;
+}
+
+
+StackInfo()
+{
+    printf("Stack: size %d, used %d, top1 %d, top2 %d\n",
+	   stack.size, stack.used, stack.top1, stack.top2);
+    return 0;
+}
+
+
+
+PrintStack(char *msg, GlobalLU_t *Glu)
+{
+    int i;
+    int *xlsub, *lsub, *xusub, *usub;
+
+    xlsub = Glu->xlsub;
+    lsub  = Glu->lsub;
+    xusub = Glu->xusub;
+    usub  = Glu->usub;
+
+    printf("%s\n", msg);
+    
+/*    printf("\nUCOL: ");
+    for (i = 0; i < xusub[ndim]; ++i)
+	printf("%f  ", ucol[i]);
+
+    printf("\nLSUB: ");
+    for (i = 0; i < xlsub[ndim]; ++i)
+	printf("%d  ", lsub[i]);
+
+    printf("\nUSUB: ");
+    for (i = 0; i < xusub[ndim]; ++i)
+	printf("%d  ", usub[i]);
+
+    printf("\n");*/
+    return 0;
+}   
+#endif
+
+
+
diff --git a/intern/opennl/superlu/mmd.c b/intern/opennl/superlu/mmd.c
new file mode 100644
index 00000000000..05f26ce0995
--- /dev/null
+++ b/intern/opennl/superlu/mmd.c
@@ -0,0 +1,1012 @@
+
+typedef int shortint;
+
+/* *************************************************************** */
+/* *************************************************************** */
+/* ****     GENMMD ..... MULTIPLE MINIMUM EXTERNAL DEGREE     **** */
+/* *************************************************************** */
+/* *************************************************************** */
+
+/*     AUTHOR - JOSEPH W.H. LIU */
+/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
+
+/*     PURPOSE - THIS ROUTINE IMPLEMENTS THE MINIMUM DEGREE */
+/*        ALGORITHM.  IT MAKES USE OF THE IMPLICIT REPRESENTATION */
+/*        OF ELIMINATION GRAPHS BY QUOTIENT GRAPHS, AND THE */
+/*        NOTION OF INDISTINGUISHABLE NODES.  IT ALSO IMPLEMENTS */
+/*        THE MODIFICATIONS BY MULTIPLE ELIMINATION AND MINIMUM */
+/*        EXTERNAL DEGREE. */
+/*        --------------------------------------------- */
+/*        CAUTION - THE ADJACENCY VECTOR ADJNCY WILL BE */
+/*        DESTROYED. */
+/*        --------------------------------------------- */
+
+/*     INPUT PARAMETERS - */
+/*        NEQNS  - NUMBER OF EQUATIONS. */
+/*        (XADJ,ADJNCY) - THE ADJACENCY STRUCTURE. */
+/*        DELTA  - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. */
+/*        MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) INTEGER */
+/*                 (ANY SMALLER ESTIMATE WILL DO) FOR MARKING */
+/*                 NODES. */
+
+/*     OUTPUT PARAMETERS - */
+/*        PERM   - THE MINIMUM DEGREE ORDERING. */
+/*        INVP   - THE INVERSE OF PERM. */
+/*        NOFSUB - AN UPPER BOUND ON THE NUMBER OF NONZERO */
+/*                 SUBSCRIPTS FOR THE COMPRESSED STORAGE SCHEME. */
+
+/*     WORKING PARAMETERS - */
+/*        DHEAD  - VECTOR FOR HEAD OF DEGREE LISTS. */
+/*        INVP   - USED TEMPORARILY FOR DEGREE FORWARD LINK. */
+/*        PERM   - USED TEMPORARILY FOR DEGREE BACKWARD LINK. */
+/*        QSIZE  - VECTOR FOR SIZE OF SUPERNODES. */
+/*        LLIST  - VECTOR FOR TEMPORARY LINKED LISTS. */
+/*        MARKER - A TEMPORARY MARKER VECTOR. */
+
+/*     PROGRAM SUBROUTINES - */
+/*        MMDELM, MMDINT, MMDNUM, MMDUPD. */
+
+/* *************************************************************** */
+
+/* Subroutine */ int genmmd_(int *neqns, int *xadj, shortint *adjncy, 
+	shortint *invp, shortint *perm, int *delta, shortint *dhead, 
+	shortint *qsize, shortint *llist, shortint *marker, int *maxint, 
+	int *nofsub)
+{
+    /* System generated locals */
+    int i__1;
+
+    /* Local variables */
+    static int mdeg, ehead, i, mdlmt, mdnode;
+    extern /* Subroutine */ int mmdelm_(int *, int *, shortint *, 
+	    shortint *, shortint *, shortint *, shortint *, shortint *, 
+	    shortint *, int *, int *), mmdupd_(int *, int *, 
+	    int *, shortint *, int *, int *, shortint *, shortint 
+	    *, shortint *, shortint *, shortint *, shortint *, int *, 
+	    int *), mmdint_(int *, int *, shortint *, shortint *, 
+	    shortint *, shortint *, shortint *, shortint *, shortint *), 
+	    mmdnum_(int *, shortint *, shortint *, shortint *);
+    static int nextmd, tag, num;
+
+
+/* *************************************************************** */
+
+
+/* *************************************************************** */
+
+    /* Parameter adjustments */
+    --marker;
+    --llist;
+    --qsize;
+    --dhead;
+    --perm;
+    --invp;
+    --adjncy;
+    --xadj;
+
+    /* Function Body */
+    if (*neqns <= 0) {
+	return 0;
+    }
+
+/*        ------------------------------------------------ */
+/*        INITIALIZATION FOR THE MINIMUM DEGREE ALGORITHM. */
+/*        ------------------------------------------------ */
+    *nofsub = 0;
+    mmdint_(neqns, &xadj[1], &adjncy[1], &dhead[1], &invp[1], &perm[1], &
+	    qsize[1], &llist[1], &marker[1]);
+
+/*        ---------------------------------------------- */
+/*        NUM COUNTS THE NUMBER OF ORDERED NODES PLUS 1. */
+/*        ---------------------------------------------- */
+    num = 1;
+
+/*        ----------------------------- */
+/*        ELIMINATE ALL ISOLATED NODES. */
+/*        ----------------------------- */
+    nextmd = dhead[1];
+L100:
+    if (nextmd <= 0) {
+	goto L200;
+    }
+    mdnode = nextmd;
+    nextmd = invp[mdnode];
+    marker[mdnode] = *maxint;
+    invp[mdnode] = -num;
+    ++num;
+    goto L100;
+
+L200:
+/*        ---------------------------------------- */
+/*        SEARCH FOR NODE OF THE MINIMUM DEGREE. */
+/*        MDEG IS THE CURRENT MINIMUM DEGREE; */
+/*        TAG IS USED TO FACILITATE MARKING NODES. */
+/*        ---------------------------------------- */
+    if (num > *neqns) {
+	goto L1000;
+    }
+    tag = 1;
+    dhead[1] = 0;
+    mdeg = 2;
+L300:
+    if (dhead[mdeg] > 0) {
+	goto L400;
+    }
+    ++mdeg;
+    goto L300;
+L400:
+/*            ------------------------------------------------- */
+/*            USE VALUE OF DELTA TO SET UP MDLMT, WHICH GOVERNS */
+/*            WHEN A DEGREE UPDATE IS TO BE PERFORMED. */
+/*            ------------------------------------------------- */
+    mdlmt = mdeg + *delta;
+    ehead = 0;
+
+L500:
+    mdnode = dhead[mdeg];
+    if (mdnode > 0) {
+	goto L600;
+    }
+    ++mdeg;
+    if (mdeg > mdlmt) {
+	goto L900;
+    }
+    goto L500;
+L600:
+/*                ---------------------------------------- */
+/*                REMOVE MDNODE FROM THE DEGREE STRUCTURE. */
+/*                ---------------------------------------- */
+    nextmd = invp[mdnode];
+    dhead[mdeg] = nextmd;
+    if (nextmd > 0) {
+	perm[nextmd] = -mdeg;
+    }
+    invp[mdnode] = -num;
+    *nofsub = *nofsub + mdeg + qsize[mdnode] - 2;
+    if (num + qsize[mdnode] > *neqns) {
+	goto L1000;
+    }
+/*                ---------------------------------------------- */
+/*                ELIMINATE MDNODE AND PERFORM QUOTIENT GRAPH */
+/*                TRANSFORMATION.  RESET TAG VALUE IF NECESSARY. */
+/*                ---------------------------------------------- */
+    ++tag;
+    if (tag < *maxint) {
+	goto L800;
+    }
+    tag = 1;
+    i__1 = *neqns;
+    for (i = 1; i <= i__1; ++i) {
+	if (marker[i] < *maxint) {
+	    marker[i] = 0;
+	}
+/* L700: */
+    }
+L800:
+    mmdelm_(&mdnode, &xadj[1], &adjncy[1], &dhead[1], &invp[1], &perm[1], &
+	    qsize[1], &llist[1], &marker[1], maxint, &tag);
+    num += qsize[mdnode];
+    llist[mdnode] = ehead;
+    ehead = mdnode;
+    if (*delta >= 0) {
+	goto L500;
+    }
+L900:
+/*            ------------------------------------------- */
+/*            UPDATE DEGREES OF THE NODES INVOLVED IN THE */
+/*            MINIMUM DEGREE NODES ELIMINATION. */
+/*            ------------------------------------------- */
+    if (num > *neqns) {
+	goto L1000;
+    }
+    mmdupd_(&ehead, neqns, &xadj[1], &adjncy[1], delta, &mdeg, &dhead[1], &
+	    invp[1], &perm[1], &qsize[1], &llist[1], &marker[1], maxint, &tag)
+	    ;
+    goto L300;
+
+L1000:
+    mmdnum_(neqns, &perm[1], &invp[1], &qsize[1]);
+    return 0;
+
+} /* genmmd_ */
+
+/* *************************************************************** */
+/* *************************************************************** */
+/* ***     MMDINT ..... MULT MINIMUM DEGREE INITIALIZATION     *** */
+/* *************************************************************** */
+/* *************************************************************** */
+
+/*     AUTHOR - JOSEPH W.H. LIU */
+/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
+
+/*     PURPOSE - THIS ROUTINE PERFORMS INITIALIZATION FOR THE */
+/*        MULTIPLE ELIMINATION VERSION OF THE MINIMUM DEGREE */
+/*        ALGORITHM. */
+
+/*     INPUT PARAMETERS - */
+/*        NEQNS  - NUMBER OF EQUATIONS. */
+/*        (XADJ,ADJNCY) - ADJACENCY STRUCTURE. */
+
+/*     OUTPUT PARAMETERS - */
+/*        (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */
+/*        QSIZE  - SIZE OF SUPERNODE (INITIALIZED TO ONE). */
+/*        LLIST  - LINKED LIST. */
+/*        MARKER - MARKER VECTOR. */
+
+/* *************************************************************** */
+
+/* Subroutine */ int mmdint_(int *neqns, int *xadj, shortint *adjncy, 
+	shortint *dhead, shortint *dforw, shortint *dbakw, shortint *qsize, 
+	shortint *llist, shortint *marker)
+{
+    /* System generated locals */
+    int i__1;
+
+    /* Local variables */
+    static int ndeg, node, fnode;
+
+
+/* *************************************************************** */
+
+
+/* *************************************************************** */
+
+    /* Parameter adjustments */
+    --marker;
+    --llist;
+    --qsize;
+    --dbakw;
+    --dforw;
+    --dhead;
+    --adjncy;
+    --xadj;
+
+    /* Function Body */
+    i__1 = *neqns;
+    for (node = 1; node <= i__1; ++node) {
+	dhead[node] = 0;
+	qsize[node] = 1;
+	marker[node] = 0;
+	llist[node] = 0;
+/* L100: */
+    }
+/*        ------------------------------------------ */
+/*        INITIALIZE THE DEGREE DOUBLY LINKED LISTS. */
+/*        ------------------------------------------ */
+    i__1 = *neqns;
+    for (node = 1; node <= i__1; ++node) {
+	ndeg = xadj[node + 1] - xadj[node] + 1;
+	fnode = dhead[ndeg];
+	dforw[node] = fnode;
+	dhead[ndeg] = node;
+	if (fnode > 0) {
+	    dbakw[fnode] = node;
+	}
+	dbakw[node] = -ndeg;
+/* L200: */
+    }
+    return 0;
+
+} /* mmdint_ */
+
+/* *************************************************************** */
+/* *************************************************************** */
+/* **     MMDELM ..... MULTIPLE MINIMUM DEGREE ELIMINATION     *** */
+/* *************************************************************** */
+/* *************************************************************** */
+
+/*     AUTHOR - JOSEPH W.H. LIU */
+/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
+
+/*     PURPOSE - THIS ROUTINE ELIMINATES THE NODE MDNODE OF */
+/*        MINIMUM DEGREE FROM THE ADJACENCY STRUCTURE, WHICH */
+/*        IS STORED IN THE QUOTIENT GRAPH FORMAT.  IT ALSO */
+/*        TRANSFORMS THE QUOTIENT GRAPH REPRESENTATION OF THE */
+/*        ELIMINATION GRAPH. */
+
+/*     INPUT PARAMETERS - */
+/*        MDNODE - NODE OF MINIMUM DEGREE. */
+/*        MAXINT - ESTIMATE OF MAXIMUM REPRESENTABLE (SHORT) */
+/*                 INT. */
+/*        TAG    - TAG VALUE. */
+
+/*     UPDATED PARAMETERS - */
+/*        (XADJ,ADJNCY) - UPDATED ADJACENCY STRUCTURE. */
+/*        (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */
+/*        QSIZE  - SIZE OF SUPERNODE. */
+/*        MARKER - MARKER VECTOR. */
+/*        LLIST  - TEMPORARY LINKED LIST OF ELIMINATED NABORS. */
+
+/* *************************************************************** */
+
+/* Subroutine */ int mmdelm_(int *mdnode, int *xadj, shortint *adjncy,
+	 shortint *dhead, shortint *dforw, shortint *dbakw, shortint *qsize, 
+	shortint *llist, shortint *marker, int *maxint, int *tag)
+{
+    /* System generated locals */
+    int i__1, i__2;
+
+    /* Local variables */
+    static int node, link, rloc, rlmt, i, j, nabor, rnode, elmnt, xqnbr, 
+	    istop, jstop, istrt, jstrt, nxnode, pvnode, nqnbrs, npv;
+
+
+/* *************************************************************** */
+
+
+/* *************************************************************** */
+
+/*        ----------------------------------------------- */
+/*        FIND REACHABLE SET AND PLACE IN DATA STRUCTURE. */
+/*        ----------------------------------------------- */
+    /* Parameter adjustments */
+    --marker;
+    --llist;
+    --qsize;
+    --dbakw;
+    --dforw;
+    --dhead;
+    --adjncy;
+    --xadj;
+
+    /* Function Body */
+    marker[*mdnode] = *tag;
+    istrt = xadj[*mdnode];
+    istop = xadj[*mdnode + 1] - 1;
+/*        ------------------------------------------------------- */
+/*        ELMNT POINTS TO THE BEGINNING OF THE LIST OF ELIMINATED */
+/*        NABORS OF MDNODE, AND RLOC GIVES THE STORAGE LOCATION */
+/*        FOR THE NEXT REACHABLE NODE. */
+/*        ------------------------------------------------------- */
+    elmnt = 0;
+    rloc = istrt;
+    rlmt = istop;
+    i__1 = istop;
+    for (i = istrt; i <= i__1; ++i) {
+	nabor = adjncy[i];
+	if (nabor == 0) {
+	    goto L300;
+	}
+	if (marker[nabor] >= *tag) {
+	    goto L200;
+	}
+	marker[nabor] = *tag;
+	if (dforw[nabor] < 0) {
+	    goto L100;
+	}
+	adjncy[rloc] = nabor;
+	++rloc;
+	goto L200;
+L100:
+	llist[nabor] = elmnt;
+	elmnt = nabor;
+L200:
+	;
+    }
+L300:
+/*            ----------------------------------------------------- */
+/*            MERGE WITH REACHABLE NODES FROM GENERALIZED ELEMENTS. */
+/*            ----------------------------------------------------- */
+    if (elmnt <= 0) {
+	goto L1000;
+    }
+    adjncy[rlmt] = -elmnt;
+    link = elmnt;
+L400:
+    jstrt = xadj[link];
+    jstop = xadj[link + 1] - 1;
+    i__1 = jstop;
+    for (j = jstrt; j <= i__1; ++j) {
+	node = adjncy[j];
+	link = -node;
+	if (node < 0) {
+	    goto L400;
+	} else if (node == 0) {
+	    goto L900;
+	} else {
+	    goto L500;
+	}
+L500:
+	if (marker[node] >= *tag || dforw[node] < 0) {
+	    goto L800;
+	}
+	marker[node] = *tag;
+/*                            --------------------------------- */
+/*                            USE STORAGE FROM ELIMINATED NODES */
+/*                            IF NECESSARY. */
+/*                            --------------------------------- */
+L600:
+	if (rloc < rlmt) {
+	    goto L700;
+	}
+	link = -adjncy[rlmt];
+	rloc = xadj[link];
+	rlmt = xadj[link + 1] - 1;
+	goto L600;
+L700:
+	adjncy[rloc] = node;
+	++rloc;
+L800:
+	;
+    }
+L900:
+    elmnt = llist[elmnt];
+    goto L300;
+L1000:
+    if (rloc <= rlmt) {
+	adjncy[rloc] = 0;
+    }
+/*        -------------------------------------------------------- */
+/*        FOR EACH NODE IN THE REACHABLE SET, DO THE FOLLOWING ... */
+/*        -------------------------------------------------------- */
+    link = *mdnode;
+L1100:
+    istrt = xadj[link];
+    istop = xadj[link + 1] - 1;
+    i__1 = istop;
+    for (i = istrt; i <= i__1; ++i) {
+	rnode = adjncy[i];
+	link = -rnode;
+	if (rnode < 0) {
+	    goto L1100;
+	} else if (rnode == 0) {
+	    goto L1800;
+	} else {
+	    goto L1200;
+	}
+L1200:
+/*                -------------------------------------------- */
+/*                IF RNODE IS IN THE DEGREE LIST STRUCTURE ... */
+/*                -------------------------------------------- */
+	pvnode = dbakw[rnode];
+	if (pvnode == 0 || pvnode == -(*maxint)) {
+	    goto L1300;
+	}
+/*                    ------------------------------------- */
+/*                    THEN REMOVE RNODE FROM THE STRUCTURE. */
+/*                    ------------------------------------- */
+	nxnode = dforw[rnode];
+	if (nxnode > 0) {
+	    dbakw[nxnode] = pvnode;
+	}
+	if (pvnode > 0) {
+	    dforw[pvnode] = nxnode;
+	}
+	npv = -pvnode;
+	if (pvnode < 0) {
+	    dhead[npv] = nxnode;
+	}
+L1300:
+/*                ---------------------------------------- */
+/*                PURGE INACTIVE QUOTIENT NABORS OF RNODE. */
+/*                ---------------------------------------- */
+	jstrt = xadj[rnode];
+	jstop = xadj[rnode + 1] - 1;
+	xqnbr = jstrt;
+	i__2 = jstop;
+	for (j = jstrt; j <= i__2; ++j) {
+	    nabor = adjncy[j];
+	    if (nabor == 0) {
+		goto L1500;
+	    }
+	    if (marker[nabor] >= *tag) {
+		goto L1400;
+	    }
+	    adjncy[xqnbr] = nabor;
+	    ++xqnbr;
+L1400:
+	    ;
+	}
+L1500:
+/*                ---------------------------------------- */
+/*                IF NO ACTIVE NABOR AFTER THE PURGING ... */
+/*                ---------------------------------------- */
+	nqnbrs = xqnbr - jstrt;
+	if (nqnbrs > 0) {
+	    goto L1600;
+	}
+/*                    ----------------------------- */
+/*                    THEN MERGE RNODE WITH MDNODE. */
+/*                    ----------------------------- */
+	qsize[*mdnode] += qsize[rnode];
+	qsize[rnode] = 0;
+	marker[rnode] = *maxint;
+	dforw[rnode] = -(*mdnode);
+	dbakw[rnode] = -(*maxint);
+	goto L1700;
+L1600:
+/*                -------------------------------------- */
+/*                ELSE FLAG RNODE FOR DEGREE UPDATE, AND */
+/*                ADD MDNODE AS A NABOR OF RNODE. */
+/*                -------------------------------------- */
+	dforw[rnode] = nqnbrs + 1;
+	dbakw[rnode] = 0;
+	adjncy[xqnbr] = *mdnode;
+	++xqnbr;
+	if (xqnbr <= jstop) {
+	    adjncy[xqnbr] = 0;
+	}
+
+L1700:
+	;
+    }
+L1800:
+    return 0;
+
+} /* mmdelm_ */
+
+/* *************************************************************** */
+/* *************************************************************** */
+/* *****     MMDUPD ..... MULTIPLE MINIMUM DEGREE UPDATE     ***** */
+/* *************************************************************** */
+/* *************************************************************** */
+
+/*     AUTHOR - JOSEPH W.H. LIU */
+/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
+
+/*     PURPOSE - THIS ROUTINE UPDATES THE DEGREES OF NODES */
+/*        AFTER A MULTIPLE ELIMINATION STEP. */
+
+/*     INPUT PARAMETERS - */
+/*        EHEAD  - THE BEGINNING OF THE LIST OF ELIMINATED */
+/*                 NODES (I.E., NEWLY FORMED ELEMENTS). */
+/*        NEQNS  - NUMBER OF EQUATIONS. */
+/*        (XADJ,ADJNCY) - ADJACENCY STRUCTURE. */
+/*        DELTA  - TOLERANCE VALUE FOR MULTIPLE ELIMINATION. */
+/*        MAXINT - MAXIMUM MACHINE REPRESENTABLE (SHORT) */
+/*                 INTEGER. */
+
+/*     UPDATED PARAMETERS - */
+/*        MDEG   - NEW MINIMUM DEGREE AFTER DEGREE UPDATE. */
+/*        (DHEAD,DFORW,DBAKW) - DEGREE DOUBLY LINKED STRUCTURE. */
+/*        QSIZE  - SIZE OF SUPERNODE. */
+/*        LLIST  - WORKING LINKED LIST. */
+/*        MARKER - MARKER VECTOR FOR DEGREE UPDATE. */
+/*        TAG    - TAG VALUE. */
+
+/* *************************************************************** */
+
+/* Subroutine */ int mmdupd_(int *ehead, int *neqns, int *xadj, 
+	shortint *adjncy, int *delta, int *mdeg, shortint *dhead, 
+	shortint *dforw, shortint *dbakw, shortint *qsize, shortint *llist, 
+	shortint *marker, int *maxint, int *tag)
+{
+    /* System generated locals */
+    int i__1, i__2;
+
+    /* Local variables */
+    static int node, mtag, link, mdeg0, i, j, enode, fnode, nabor, elmnt, 
+	    istop, jstop, q2head, istrt, jstrt, qxhead, iq2, deg, deg0;
+
+
+/* *************************************************************** */
+
+
+/* *************************************************************** */
+
+    /* Parameter adjustments */
+    --marker;
+    --llist;
+    --qsize;
+    --dbakw;
+    --dforw;
+    --dhead;
+    --adjncy;
+    --xadj;
+
+    /* Function Body */
+    mdeg0 = *mdeg + *delta;
+    elmnt = *ehead;
+L100:
+/*            ------------------------------------------------------- */
+/*            FOR EACH OF THE NEWLY FORMED ELEMENT, DO THE FOLLOWING. */
+/*            (RESET TAG VALUE IF NECESSARY.) */
+/*            ------------------------------------------------------- */
+    if (elmnt <= 0) {
+	return 0;
+    }
+    mtag = *tag + mdeg0;
+    if (mtag < *maxint) {
+	goto L300;
+    }
+    *tag = 1;
+    i__1 = *neqns;
+    for (i = 1; i <= i__1; ++i) {
+	if (marker[i] < *maxint) {
+	    marker[i] = 0;
+	}
+/* L200: */
+    }
+    mtag = *tag + mdeg0;
+L300:
+/*            --------------------------------------------- */
+/*            CREATE TWO LINKED LISTS FROM NODES ASSOCIATED */
+/*            WITH ELMNT: ONE WITH TWO NABORS (Q2HEAD) IN */
+/*            ADJACENCY STRUCTURE, AND THE OTHER WITH MORE */
+/*            THAN TWO NABORS (QXHEAD).  ALSO COMPUTE DEG0, */
+/*            NUMBER OF NODES IN THIS ELEMENT. */
+/*            --------------------------------------------- */
+    q2head = 0;
+    qxhead = 0;
+    deg0 = 0;
+    link = elmnt;
+L400:
+    istrt = xadj[link];
+    istop = xadj[link + 1] - 1;
+    i__1 = istop;
+    for (i = istrt; i <= i__1; ++i) {
+	enode = adjncy[i];
+	link = -enode;
+	if (enode < 0) {
+	    goto L400;
+	} else if (enode == 0) {
+	    goto L800;
+	} else {
+	    goto L500;
+	}
+
+L500:
+	if (qsize[enode] == 0) {
+	    goto L700;
+	}
+	deg0 += qsize[enode];
+	marker[enode] = mtag;
+/*                        ---------------------------------- */
+/*                        IF ENODE REQUIRES A DEGREE UPDATE, */
+/*                        THEN DO THE FOLLOWING. */
+/*                        ---------------------------------- */
+	if (dbakw[enode] != 0) {
+	    goto L700;
+	}
+/*                            --------------------------------------- 
+*/
+/*                            PLACE EITHER IN QXHEAD OR Q2HEAD LISTS. 
+*/
+/*                            --------------------------------------- 
+*/
+	if (dforw[enode] == 2) {
+	    goto L600;
+	}
+	llist[enode] = qxhead;
+	qxhead = enode;
+	goto L700;
+L600:
+	llist[enode] = q2head;
+	q2head = enode;
+L700:
+	;
+    }
+L800:
+/*            -------------------------------------------- */
+/*            FOR EACH ENODE IN Q2 LIST, DO THE FOLLOWING. */
+/*            -------------------------------------------- */
+    enode = q2head;
+    iq2 = 1;
+L900:
+    if (enode <= 0) {
+	goto L1500;
+    }
+    if (dbakw[enode] != 0) {
+	goto L2200;
+    }
+    ++(*tag);
+    deg = deg0;
+/*                    ------------------------------------------ */
+/*                    IDENTIFY THE OTHER ADJACENT ELEMENT NABOR. */
+/*                    ------------------------------------------ */
+    istrt = xadj[enode];
+    nabor = adjncy[istrt];
+    if (nabor == elmnt) {
+	nabor = adjncy[istrt + 1];
+    }
+/*                    ------------------------------------------------ */
+/*                    IF NABOR IS UNELIMINATED, INCREASE DEGREE COUNT. */
+/*                    ------------------------------------------------ */
+    link = nabor;
+    if (dforw[nabor] < 0) {
+	goto L1000;
+    }
+    deg += qsize[nabor];
+    goto L2100;
+L1000:
+/*                        -------------------------------------------- */
+/*                        OTHERWISE, FOR EACH NODE IN THE 2ND ELEMENT, */
+/*                        DO THE FOLLOWING. */
+/*                        -------------------------------------------- */
+    istrt = xadj[link];
+    istop = xadj[link + 1] - 1;
+    i__1 = istop;
+    for (i = istrt; i <= i__1; ++i) {
+	node = adjncy[i];
+	link = -node;
+	if (node == enode) {
+	    goto L1400;
+	}
+	if (node < 0) {
+	    goto L1000;
+	} else if (node == 0) {
+	    goto L2100;
+	} else {
+	    goto L1100;
+	}
+
+L1100:
+	if (qsize[node] == 0) {
+	    goto L1400;
+	}
+	if (marker[node] >= *tag) {
+	    goto L1200;
+	}
+/*                                -----------------------------------
+-- */
+/*                                CASE WHEN NODE IS NOT YET CONSIDERED
+. */
+/*                                -----------------------------------
+-- */
+	marker[node] = *tag;
+	deg += qsize[node];
+	goto L1400;
+L1200:
+/*                            ----------------------------------------
+ */
+/*                            CASE WHEN NODE IS INDISTINGUISHABLE FROM
+ */
+/*                            ENODE.  MERGE THEM INTO A NEW SUPERNODE.
+ */
+/*                            ----------------------------------------
+ */
+	if (dbakw[node] != 0) {
+	    goto L1400;
+	}
+	if (dforw[node] != 2) {
+	    goto L1300;
+	}
+	qsize[enode] += qsize[node];
+	qsize[node] = 0;
+	marker[node] = *maxint;
+	dforw[node] = -enode;
+	dbakw[node] = -(*maxint);
+	goto L1400;
+L1300:
+/*                            -------------------------------------- 
+*/
+/*                            CASE WHEN NODE IS OUTMATCHED BY ENODE. 
+*/
+/*                            -------------------------------------- 
+*/
+	if (dbakw[node] == 0) {
+	    dbakw[node] = -(*maxint);
+	}
+L1400:
+	;
+    }
+    goto L2100;
+L1500:
+/*                ------------------------------------------------ */
+/*                FOR EACH ENODE IN THE QX LIST, DO THE FOLLOWING. */
+/*                ------------------------------------------------ */
+    enode = qxhead;
+    iq2 = 0;
+L1600:
+    if (enode <= 0) {
+	goto L2300;
+    }
+    if (dbakw[enode] != 0) {
+	goto L2200;
+    }
+    ++(*tag);
+    deg = deg0;
+/*                        --------------------------------- */
+/*                        FOR EACH UNMARKED NABOR OF ENODE, */
+/*                        DO THE FOLLOWING. */
+/*                        --------------------------------- */
+    istrt = xadj[enode];
+    istop = xadj[enode + 1] - 1;
+    i__1 = istop;
+    for (i = istrt; i <= i__1; ++i) {
+	nabor = adjncy[i];
+	if (nabor == 0) {
+	    goto L2100;
+	}
+	if (marker[nabor] >= *tag) {
+	    goto L2000;
+	}
+	marker[nabor] = *tag;
+	link = nabor;
+/*                                ------------------------------ */
+/*                                IF UNELIMINATED, INCLUDE IT IN */
+/*                                DEG COUNT. */
+/*                                ------------------------------ */
+	if (dforw[nabor] < 0) {
+	    goto L1700;
+	}
+	deg += qsize[nabor];
+	goto L2000;
+L1700:
+/*                                    ------------------------------- 
+*/
+/*                                    IF ELIMINATED, INCLUDE UNMARKED 
+*/
+/*                                    NODES IN THIS ELEMENT INTO THE 
+*/
+/*                                    DEGREE COUNT. */
+/*                                    ------------------------------- 
+*/
+	jstrt = xadj[link];
+	jstop = xadj[link + 1] - 1;
+	i__2 = jstop;
+	for (j = jstrt; j <= i__2; ++j) {
+	    node = adjncy[j];
+	    link = -node;
+	    if (node < 0) {
+		goto L1700;
+	    } else if (node == 0) {
+		goto L2000;
+	    } else {
+		goto L1800;
+	    }
+
+L1800:
+	    if (marker[node] >= *tag) {
+		goto L1900;
+	    }
+	    marker[node] = *tag;
+	    deg += qsize[node];
+L1900:
+	    ;
+	}
+L2000:
+	;
+    }
+L2100:
+/*                    ------------------------------------------- */
+/*                    UPDATE EXTERNAL DEGREE OF ENODE IN DEGREE */
+/*                    STRUCTURE, AND MDEG (MIN DEG) IF NECESSARY. */
+/*                    ------------------------------------------- */
+    deg = deg - qsize[enode] + 1;
+    fnode = dhead[deg];
+    dforw[enode] = fnode;
+    dbakw[enode] = -deg;
+    if (fnode > 0) {
+	dbakw[fnode] = enode;
+    }
+    dhead[deg] = enode;
+    if (deg < *mdeg) {
+	*mdeg = deg;
+    }
+L2200:
+/*                    ---------------------------------- */
+/*                    GET NEXT ENODE IN CURRENT ELEMENT. */
+/*                    ---------------------------------- */
+    enode = llist[enode];
+    if (iq2 == 1) {
+	goto L900;
+    }
+    goto L1600;
+L2300:
+/*            ----------------------------- */
+/*            GET NEXT ELEMENT IN THE LIST. */
+/*            ----------------------------- */
+    *tag = mtag;
+    elmnt = llist[elmnt];
+    goto L100;
+
+} /* mmdupd_ */
+
+/* *************************************************************** */
+/* *************************************************************** */
+/* *****     MMDNUM ..... MULTI MINIMUM DEGREE NUMBERING     ***** */
+/* *************************************************************** */
+/* *************************************************************** */
+
+/*     AUTHOR - JOSEPH W.H. LIU */
+/*              DEPT OF COMPUTER SCIENCE, YORK UNIVERSITY. */
+
+/*     PURPOSE - THIS ROUTINE PERFORMS THE FINAL STEP IN */
+/*        PRODUCING THE PERMUTATION AND INVERSE PERMUTATION */
+/*        VECTORS IN THE MULTIPLE ELIMINATION VERSION OF THE */
+/*        MINIMUM DEGREE ORDERING ALGORITHM. */
+
+/*     INPUT PARAMETERS - */
+/*        NEQNS  - NUMBER OF EQUATIONS. */
+/*        QSIZE  - SIZE OF SUPERNODES AT ELIMINATION. */
+
+/*     UPDATED PARAMETERS - */
+/*        INVP   - INVERSE PERMUTATION VECTOR.  ON INPUT, */
+/*                 IF QSIZE(NODE)=0, THEN NODE HAS BEEN MERGED */
+/*                 INTO THE NODE -INVP(NODE); OTHERWISE, */
+/*                 -INVP(NODE) IS ITS INVERSE LABELLING. */
+
+/*     OUTPUT PARAMETERS - */
+/*        PERM   - THE PERMUTATION VECTOR. */
+
+/* *************************************************************** */
+
+/* Subroutine */ int mmdnum_(int *neqns, shortint *perm, shortint *invp, 
+	shortint *qsize)
+{
+    /* System generated locals */
+    int i__1;
+
+    /* Local variables */
+    static int node, root, nextf, father, nqsize, num;
+
+
+/* *************************************************************** */
+
+
+/* *************************************************************** */
+
+    /* Parameter adjustments */
+    --qsize;
+    --invp;
+    --perm;
+
+    /* Function Body */
+    i__1 = *neqns;
+    for (node = 1; node <= i__1; ++node) {
+	nqsize = qsize[node];
+	if (nqsize <= 0) {
+	    perm[node] = invp[node];
+	}
+	if (nqsize > 0) {
+	    perm[node] = -invp[node];
+	}
+/* L100: */
+    }
+/*        ------------------------------------------------------ */
+/*        FOR EACH NODE WHICH HAS BEEN MERGED, DO THE FOLLOWING. */
+/*        ------------------------------------------------------ */
+    i__1 = *neqns;
+    for (node = 1; node <= i__1; ++node) {
+	if (perm[node] > 0) {
+	    goto L500;
+	}
+/*                ----------------------------------------- */
+/*                TRACE THE MERGED TREE UNTIL ONE WHICH HAS */
+/*                NOT BEEN MERGED, CALL IT ROOT. */
+/*                ----------------------------------------- */
+	father = node;
+L200:
+	if (perm[father] > 0) {
+	    goto L300;
+	}
+	father = -perm[father];
+	goto L200;
+L300:
+/*                ----------------------- */
+/*                NUMBER NODE AFTER ROOT. */
+/*                ----------------------- */
+	root = father;
+	num = perm[root] + 1;
+	invp[node] = -num;
+	perm[root] = num;
+/*                ------------------------ */
+/*                SHORTEN THE MERGED TREE. */
+/*                ------------------------ */
+	father = node;
+L400:
+	nextf = -perm[father];
+	if (nextf <= 0) {
+	    goto L500;
+	}
+	perm[father] = -root;
+	father = nextf;
+	goto L400;
+L500:
+	;
+    }
+/*        ---------------------- */
+/*        READY TO COMPUTE PERM. */
+/*        ---------------------- */
+    i__1 = *neqns;
+    for (node = 1; node <= i__1; ++node) {
+	num = -invp[node];
+	invp[node] = num;
+	perm[num] = node;
+/* L600: */
+    }
+    return 0;
+
+} /* mmdnum_ */
+
diff --git a/intern/opennl/superlu/relax_snode.c b/intern/opennl/superlu/relax_snode.c
new file mode 100644
index 00000000000..549f3fcf873
--- /dev/null
+++ b/intern/opennl/superlu/relax_snode.c
@@ -0,0 +1,71 @@
+/*
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ */
+/*
+  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ 
+  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ 
+  Permission is hereby granted to use or copy this program for any
+  purpose, provided the above notices are retained on all copies.
+  Permission to modify the code and to distribute modified code is
+  granted, provided the above notices are retained, and a notice that
+  the code was modified is included with the above copyright notice.
+*/
+
+#include "ssp_defs.h"
+
+void
+relax_snode (
+	     const     int n,
+	     int       *et,           /* column elimination tree */
+	     const int relax_columns, /* max no of columns allowed in a
+					 relaxed snode */
+	     int       *descendants,  /* no of descendants of each node
+					 in the etree */
+	     int       *relax_end     /* last column in a supernode */
+	     )
+{
+/*
+ * Purpose
+ * =======
+ *    relax_snode() - Identify the initial relaxed supernodes, assuming that 
+ *    the matrix has been reordered according to the postorder of the etree.
+ *
+ */ 
+    register int j, parent;
+    register int snode_start;	/* beginning of a snode */
+    
+    ifill (relax_end, n, EMPTY);
+    for (j = 0; j < n; j++) descendants[j] = 0;
+
+    /* Compute the number of descendants of each node in the etree */
+    for (j = 0; j < n; j++) {
+	parent = et[j];
+	if ( parent != n )  /* not the dummy root */
+	    descendants[parent] += descendants[j] + 1;
+    }
+
+    /* Identify the relaxed supernodes by postorder traversal of the etree. */
+    for (j = 0; j < n; ) { 
+     	parent = et[j];
+        snode_start = j;
+ 	while ( parent != n && descendants[parent] < relax_columns ) {
+	    j = parent;
+	    parent = et[j];
+	}
+	/* Found a supernode with j being the last column. */
+	relax_end[snode_start] = j;		/* Last column is recorded */
+	j++;
+	/* Search for a new leaf */
+	while ( descendants[j] != 0 && j < n ) j++;
+    }
+
+    /*printf("No of relaxed snodes: %d; relaxed columns: %d\n", 
+		nsuper, no_relaxed_col); */
+}
diff --git a/intern/opennl/superlu/scolumn_bmod.c b/intern/opennl/superlu/scolumn_bmod.c
new file mode 100644
index 00000000000..c877a27dd53
--- /dev/null
+++ b/intern/opennl/superlu/scolumn_bmod.c
@@ -0,0 +1,353 @@
+
+/*
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ */
+/*
+  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ 
+  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ 
+  Permission is hereby granted to use or copy this program for any
+  purpose, provided the above notices are retained on all copies.
+  Permission to modify the code and to distribute modified code is
+  granted, provided the above notices are retained, and a notice that
+  the code was modified is included with the above copyright notice.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "ssp_defs.h"
+
+/* 
+ * Function prototypes 
+ */
+void susolve(int, int, float*, float*);
+void slsolve(int, int, float*, float*);
+void smatvec(int, int, int, float*, float*, float*);
+
+
+
+/* Return value:   0 - successful return
+ *               > 0 - number of bytes allocated when run out of space
+ */
+int
+scolumn_bmod (
+	     const int  jcol,	  /* in */
+	     const int  nseg,	  /* in */
+	     float     *dense,	  /* in */
+	     float     *tempv,	  /* working array */
+	     int        *segrep,  /* in */
+	     int        *repfnz,  /* in */
+	     int        fpanelc,  /* in -- first column in the current panel */
+	     GlobalLU_t *Glu,     /* modified */
+	     SuperLUStat_t *stat  /* output */
+	     )
+{
+/*
+ * Purpose:
+ * ========
+ *    Performs numeric block updates (sup-col) in topological order.
+ *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
+ *    Special processing on the supernodal portion of L\U[*,j]
+ *
+ */
+#ifdef _CRAY
+    _fcd ftcs1 = _cptofcd("L", strlen("L")),
+         ftcs2 = _cptofcd("N", strlen("N")),
+         ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+
+#ifdef USE_VENDOR_BLAS
+    int         incx = 1, incy = 1;
+    float      alpha, beta;
+#endif
+    
+    /* krep = representative of current k-th supernode
+     * fsupc = first supernodal column
+     * nsupc = no of columns in supernode
+     * nsupr = no of rows in supernode (used as leading dimension)
+     * luptr = location of supernodal LU-block in storage
+     * kfnz = first nonz in the k-th supernodal segment
+     * no_zeros = no of leading zeros in a supernodal U-segment
+     */
+    float       ukj, ukj1, ukj2;
+    int          luptr, luptr1, luptr2;
+    int          fsupc, nsupc, nsupr, segsze;
+    int          nrow;	  /* No of rows in the matrix of matrix-vector */
+    int          jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno;
+    register int lptr, kfnz, isub, irow, i;
+    register int no_zeros, new_next; 
+    int          ufirst, nextlu;
+    int          fst_col; /* First column within small LU update */
+    int          d_fsupc; /* Distance between the first column of the current
+			     panel and the first column of the current snode. */
+    int          *xsup, *supno;
+    int          *lsub, *xlsub;
+    float       *lusup;
+    int          *xlusup;
+    int          nzlumax;
+    float       *tempv1;
+    float      zero = 0.0;
+#ifdef USE_VENDOR_BLAS
+    float      one = 1.0;
+    float      none = -1.0;
+#endif
+    int          mem_error;
+    flops_t      *ops = stat->ops;
+
+    xsup    = Glu->xsup;
+    supno   = Glu->supno;
+    lsub    = Glu->lsub;
+    xlsub   = Glu->xlsub;
+    lusup   = Glu->lusup;
+    xlusup  = Glu->xlusup;
+    nzlumax = Glu->nzlumax;
+    jcolp1 = jcol + 1;
+    jsupno = supno[jcol];
+    
+    /* 
+     * For each nonz supernode segment of U[*,j] in topological order 
+     */
+    k = nseg - 1;
+    for (ksub = 0; ksub < nseg; ksub++) {
+
+	krep = segrep[k];
+	k--;
+	ksupno = supno[krep];
+	if ( jsupno != ksupno ) { /* Outside the rectangular supernode */
+
+	    fsupc = xsup[ksupno];
+	    fst_col = SUPERLU_MAX ( fsupc, fpanelc );
+
+  	    /* Distance from the current supernode to the current panel; 
+	       d_fsupc=0 if fsupc > fpanelc. */
+  	    d_fsupc = fst_col - fsupc; 
+
+	    luptr = xlusup[fst_col] + d_fsupc;
+	    lptr = xlsub[fsupc] + d_fsupc;
+
+	    kfnz = repfnz[krep];
+	    kfnz = SUPERLU_MAX ( kfnz, fpanelc );
+
+	    segsze = krep - kfnz + 1;
+	    nsupc = krep - fst_col + 1;
+	    nsupr = xlsub[fsupc+1] - xlsub[fsupc];	/* Leading dimension */
+	    nrow = nsupr - d_fsupc - nsupc;
+	    krep_ind = lptr + nsupc - 1;
+
+	    ops[TRSV] += segsze * (segsze - 1);
+	    ops[GEMV] += 2 * nrow * segsze;
+
+
+	    /* 
+	     * Case 1: Update U-segment of size 1 -- col-col update 
+	     */
+	    if ( segsze == 1 ) {
+	  	ukj = dense[lsub[krep_ind]];
+		luptr += nsupr*(nsupc-1) + nsupc;
+
+		for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
+		    irow = lsub[i];
+		    dense[irow] -=  ukj*lusup[luptr];
+		    luptr++;
+		}
+
+	    } else if ( segsze <= 3 ) {
+		ukj = dense[lsub[krep_ind]];
+		luptr += nsupr*(nsupc-1) + nsupc-1;
+		ukj1 = dense[lsub[krep_ind - 1]];
+		luptr1 = luptr - nsupr;
+
+		if ( segsze == 2 ) { /* Case 2: 2cols-col update */
+		    ukj -= ukj1 * lusup[luptr1];
+		    dense[lsub[krep_ind]] = ukj;
+		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
+		    	irow = lsub[i];
+		    	luptr++;
+		    	luptr1++;
+		    	dense[irow] -= ( ukj*lusup[luptr]
+					+ ukj1*lusup[luptr1] );
+		    }
+		} else { /* Case 3: 3cols-col update */
+		    ukj2 = dense[lsub[krep_ind - 2]];
+		    luptr2 = luptr1 - nsupr;
+		    ukj1 -= ukj2 * lusup[luptr2-1];
+		    ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
+		    dense[lsub[krep_ind]] = ukj;
+		    dense[lsub[krep_ind-1]] = ukj1;
+		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
+		    	irow = lsub[i];
+		    	luptr++;
+		    	luptr1++;
+			luptr2++;
+		    	dense[irow] -= ( ukj*lusup[luptr]
+			     + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
+		    }
+		}
+
+
+
+	    } else {
+	  	/*
+		 * Case: sup-col update
+		 * Perform a triangular solve and block update,
+		 * then scatter the result of sup-col update to dense
+		 */
+
+		no_zeros = kfnz - fst_col;
+
+	        /* Copy U[*,j] segment from dense[*] to tempv[*] */
+	        isub = lptr + no_zeros;
+	        for (i = 0; i < segsze; i++) {
+	  	    irow = lsub[isub];
+		    tempv[i] = dense[irow];
+		    ++isub; 
+	        }
+
+	        /* Dense triangular solve -- start effective triangle */
+		luptr += nsupr * no_zeros + no_zeros; 
+		
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+		STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
+		       &nsupr, tempv, &incx );
+#else		
+		strsv_( "L", "N", "U", &segsze, &lusup[luptr], 
+		       &nsupr, tempv, &incx );
+#endif		
+ 		luptr += segsze;  /* Dense matrix-vector */
+		tempv1 = &tempv[segsze];
+                alpha = one;
+                beta = zero;
+#ifdef _CRAY
+		SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
+		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
+#else
+		sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
+		       &nsupr, tempv, &incx, &beta, tempv1, &incy );
+#endif
+#else
+		slsolve ( nsupr, segsze, &lusup[luptr], tempv );
+
+ 		luptr += segsze;  /* Dense matrix-vector */
+		tempv1 = &tempv[segsze];
+		smatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1);
+#endif
+		
+		
+                /* Scatter tempv[] into SPA dense[] as a temporary storage */
+                isub = lptr + no_zeros;
+                for (i = 0; i < segsze; i++) {
+                    irow = lsub[isub];
+                    dense[irow] = tempv[i];
+                    tempv[i] = zero;
+                    ++isub;
+                }
+
+		/* Scatter tempv1[] into SPA dense[] */
+		for (i = 0; i < nrow; i++) {
+		    irow = lsub[isub];
+		    dense[irow] -= tempv1[i];
+		    tempv1[i] = zero;
+		    ++isub;
+		}
+	    }
+	    
+	} /* if jsupno ... */
+
+    } /* for each segment... */
+
+    /*
+     *	Process the supernodal portion of L\U[*,j]
+     */
+    nextlu = xlusup[jcol];
+    fsupc = xsup[jsupno];
+
+    /* Copy the SPA dense into L\U[*,j] */
+    new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc];
+    while ( new_next > nzlumax ) {
+	if ((mem_error = sLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu)))
+	    return (mem_error);
+	lusup = Glu->lusup;
+	lsub = Glu->lsub;
+    }
+
+    for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) {
+  	irow = lsub[isub];
+	lusup[nextlu] = dense[irow];
+        dense[irow] = zero;
+	++nextlu;
+    }
+
+    xlusup[jcolp1] = nextlu;	/* Close L\U[*,jcol] */
+
+    /* For more updates within the panel (also within the current supernode), 
+     * should start from the first column of the panel, or the first column 
+     * of the supernode, whichever is bigger. There are 2 cases:
+     *    1) fsupc < fpanelc, then fst_col := fpanelc
+     *    2) fsupc >= fpanelc, then fst_col := fsupc
+     */
+    fst_col = SUPERLU_MAX ( fsupc, fpanelc );
+
+    if ( fst_col < jcol ) {
+
+  	/* Distance between the current supernode and the current panel.
+	   d_fsupc=0 if fsupc >= fpanelc. */
+  	d_fsupc = fst_col - fsupc;
+
+	lptr = xlsub[fsupc] + d_fsupc;
+	luptr = xlusup[fst_col] + d_fsupc;
+	nsupr = xlsub[fsupc+1] - xlsub[fsupc];	/* Leading dimension */
+	nsupc = jcol - fst_col;	/* Excluding jcol */
+	nrow = nsupr - d_fsupc - nsupc;
+
+	/* Points to the beginning of jcol in snode L\U(jsupno) */
+	ufirst = xlusup[jcol] + d_fsupc;	
+
+	ops[TRSV] += nsupc * (nsupc - 1);
+	ops[GEMV] += 2 * nrow * nsupc;
+	
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+	STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], 
+	       &nsupr, &lusup[ufirst], &incx );
+#else
+	strsv_( "L", "N", "U", &nsupc, &lusup[luptr], 
+	       &nsupr, &lusup[ufirst], &incx );
+#endif
+	
+	alpha = none; beta = one; /* y := beta*y + alpha*A*x */
+
+#ifdef _CRAY
+	SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
+	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
+#else
+	sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
+	       &lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
+#endif
+#else
+	slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );
+
+	smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc],
+		&lusup[ufirst], tempv );
+	
+        /* Copy updates from tempv[*] into lusup[*] */
+	isub = ufirst + nsupc;
+	for (i = 0; i < nrow; i++) {
+	    lusup[isub] -= tempv[i];
+	    tempv[i] = 0.0;
+	    ++isub;
+	}
+
+#endif
+	
+	
+    } /* if fst_col < jcol ... */ 
+
+    return 0;
+}
diff --git a/intern/opennl/superlu/scolumn_dfs.c b/intern/opennl/superlu/scolumn_dfs.c
new file mode 100644
index 00000000000..ecfb5c3b839
--- /dev/null
+++ b/intern/opennl/superlu/scolumn_dfs.c
@@ -0,0 +1,270 @@
+
+
+/*
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ */
+/*
+  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ 
+  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ 
+  Permission is hereby granted to use or copy this program for any
+  purpose, provided the above notices are retained on all copies.
+  Permission to modify the code and to distribute modified code is
+  granted, provided the above notices are retained, and a notice that
+  the code was modified is included with the above copyright notice.
+*/
+
+#include "ssp_defs.h"
+
+/* What type of supernodes we want */
+#define T2_SUPER
+
+int
+scolumn_dfs(
+	   const int  m,         /* in - number of rows in the matrix */
+	   const int  jcol,      /* in */
+	   int        *perm_r,   /* in */
+	   int        *nseg,     /* modified - with new segments appended */
+	   int        *lsub_col, /* in - defines the RHS vector to start the dfs */
+	   int        *segrep,   /* modified - with new segments appended */
+	   int        *repfnz,   /* modified */
+	   int        *xprune,   /* modified */
+	   int        *marker,   /* modified */
+	   int        *parent,	 /* working array */
+	   int        *xplore,   /* working array */
+	   GlobalLU_t *Glu       /* modified */
+	   )
+{
+/* 
+ * Purpose
+ * =======
+ *   "column_dfs" performs a symbolic factorization on column jcol, and
+ *   decide the supernode boundary.
+ *
+ *   This routine does not use numeric values, but only use the RHS 
+ *   row indices to start the dfs.
+ *
+ *   A supernode representative is the last column of a supernode.
+ *   The nonzeros in U[*,j] are segments that end at supernodal
+ *   representatives. The routine returns a list of such supernodal 
+ *   representatives in topological order of the dfs that generates them.
+ *   The location of the first nonzero in each such supernodal segment
+ *   (supernodal entry location) is also returned.
+ *
+ * Local parameters
+ * ================
+ *   nseg: no of segments in current U[*,j]
+ *   jsuper: jsuper=EMPTY if column j does not belong to the same
+ *	supernode as j-1. Otherwise, jsuper=nsuper.
+ *
+ *   marker2: A-row --> A-row/col (0/1)
+ *   repfnz: SuperA-col --> PA-row
+ *   parent: SuperA-col --> SuperA-col
+ *   xplore: SuperA-col --> index to L-structure
+ *
+ * Return value
+ * ============
+ *     0  success;
+ *   > 0  number of bytes allocated when run out of space.
+ *
+ */
+    int     jcolp1, jcolm1, jsuper, nsuper, nextl;
+    int     k, krep, krow, kmark, kperm;
+    int     *marker2;           /* Used for small panel LU */
+    int	    fsupc;		/* First column of a snode */
+    int     myfnz;		/* First nonz column of a U-segment */
+    int	    chperm, chmark, chrep, kchild;
+    int     xdfs, maxdfs, kpar, oldrep;
+    int     jptr, jm1ptr;
+    int     ito, ifrom, istop;	/* Used to compress row subscripts */
+    int     mem_error;
+    int     *xsup, *supno, *lsub, *xlsub;
+    int     nzlmax;
+    static  int  first = 1, maxsuper;
+    
+    xsup    = Glu->xsup;
+    supno   = Glu->supno;
+    lsub    = Glu->lsub;
+    xlsub   = Glu->xlsub;
+    nzlmax  = Glu->nzlmax;
+
+    if ( first ) {
+	maxsuper = sp_ienv(3);
+	first = 0;
+    }
+    jcolp1  = jcol + 1;
+    jcolm1  = jcol - 1;
+    nsuper  = supno[jcol];
+    jsuper  = nsuper;
+    nextl   = xlsub[jcol];
+    marker2 = &marker[2*m];
+
+
+    /* For each nonzero in A[*,jcol] do dfs */
+    for (k = 0; lsub_col[k] != EMPTY; k++) {
+
+	krow = lsub_col[k];
+    	lsub_col[k] = EMPTY;
+	kmark = marker2[krow];    	
+
+	/* krow was visited before, go to the next nonz */
+        if ( kmark == jcol ) continue; 
+
+	/* For each unmarked nbr krow of jcol
+	 *	krow is in L: place it in structure of L[*,jcol]
+	 */
+	marker2[krow] = jcol;
+	kperm = perm_r[krow];
+
+   	if ( kperm == EMPTY ) {
+	    lsub[nextl++] = krow; 	/* krow is indexed into A */
+	    if ( nextl >= nzlmax ) {
+		if ((mem_error = sLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu)))
+		    return (mem_error);
+		lsub = Glu->lsub;
+	    }
+            if ( kmark != jcolm1 ) jsuper = EMPTY;/* Row index subset testing */
+  	} else {
+	    /*	krow is in U: if its supernode-rep krep
+	     *	has been explored, update repfnz[*]
+	     */
+	    krep = xsup[supno[kperm]+1] - 1;
+	    myfnz = repfnz[krep];
+
+	    if ( myfnz != EMPTY ) {	/* Visited before */
+	    	if ( myfnz > kperm ) repfnz[krep] = kperm;
+		/* continue; */
+	    }
+	    else {
+		/* Otherwise, perform dfs starting at krep */
+		oldrep = EMPTY;
+	 	parent[krep] = oldrep;
+	  	repfnz[krep] = kperm;
+		xdfs = xlsub[krep];
+	  	maxdfs = xprune[krep];
+
+		do {
+		    /* 
+		     * For each unmarked kchild of krep 
+		     */
+		    while ( xdfs < maxdfs ) {
+
+		   	kchild = lsub[xdfs];
+			xdfs++;
+		  	chmark = marker2[kchild];
+
+		   	if ( chmark != jcol ) { /* Not reached yet */
+		   	    marker2[kchild] = jcol;
+		   	    chperm = perm_r[kchild];
+
+		   	    /* Case kchild is in L: place it in L[*,k] */
+		   	    if ( chperm == EMPTY ) {
+			    	lsub[nextl++] = kchild;
+				if ( nextl >= nzlmax ) {
+				    if ((mem_error =
+					 sLUMemXpand(jcol,nextl,LSUB,&nzlmax,Glu)))
+					return (mem_error);
+				    lsub = Glu->lsub;
+				}
+				if ( chmark != jcolm1 ) jsuper = EMPTY;
+			    } else {
+		    	    	/* Case kchild is in U: 
+				 *   chrep = its supernode-rep. If its rep has 
+			         *   been explored, update its repfnz[*]
+			         */
+		   	    	chrep = xsup[supno[chperm]+1] - 1;
+		   		myfnz = repfnz[chrep];
+		   		if ( myfnz != EMPTY ) { /* Visited before */
+				    if ( myfnz > chperm )
+     				  	repfnz[chrep] = chperm;
+				} else {
+		        	    /* Continue dfs at super-rep of kchild */
+		   		    xplore[krep] = xdfs;	
+		   		    oldrep = krep;
+		   		    krep = chrep; /* Go deeper down G(L^t) */
+				    parent[krep] = oldrep;
+		    		    repfnz[krep] = chperm;
+		   		    xdfs = xlsub[krep];     
+				    maxdfs = xprune[krep];
+				} /* else */
+
+			   } /* else */
+
+			} /* if */
+
+		    } /* while */
+
+		    /* krow has no more unexplored nbrs;
+	   	     *    place supernode-rep krep in postorder DFS.
+	   	     *    backtrack dfs to its parent
+		     */
+		    segrep[*nseg] = krep;
+		    ++(*nseg);
+		    kpar = parent[krep]; /* Pop from stack, mimic recursion */
+		    if ( kpar == EMPTY ) break; /* dfs done */
+		    krep = kpar;
+		    xdfs = xplore[krep];
+		    maxdfs = xprune[krep];
+
+		} while ( kpar != EMPTY ); 	/* Until empty stack */
+
+	    } /* else */
+
+	} /* else */
+
+    } /* for each nonzero ... */
+
+    /* Check to see if j belongs in the same supernode as j-1 */
+    if ( jcol == 0 ) { /* Do nothing for column 0 */
+	nsuper = supno[0] = 0;
+    } else {
+   	fsupc = xsup[nsuper];
+	jptr = xlsub[jcol];	/* Not compressed yet */
+	jm1ptr = xlsub[jcolm1];
+
+#ifdef T2_SUPER
+	if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = EMPTY;
+#endif
+	/* Make sure the number of columns in a supernode doesn't
+	   exceed threshold. */
+	if ( jcol - fsupc >= maxsuper ) jsuper = EMPTY;
+
+	/* If jcol starts a new supernode, reclaim storage space in
+	 * lsub from the previous supernode. Note we only store
+	 * the subscript set of the first and last columns of
+   	 * a supernode. (first for num values, last for pruning)
+	 */
+	if ( jsuper == EMPTY ) {	/* starts a new supernode */
+	    if ( (fsupc < jcolm1-1) ) {	/* >= 3 columns in nsuper */
+#ifdef CHK_COMPRESS
+		printf("  Compress lsub[] at super %d-%d\n", fsupc, jcolm1);
+#endif
+	        ito = xlsub[fsupc+1];
+		xlsub[jcolm1] = ito;
+		istop = ito + jptr - jm1ptr;
+		xprune[jcolm1] = istop; /* Initialize xprune[jcol-1] */
+		xlsub[jcol] = istop;
+		for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito)
+		    lsub[ito] = lsub[ifrom];
+		nextl = ito;            /* = istop + length(jcol) */
+	    }
+	    nsuper++;
+	    supno[jcol] = nsuper;
+	} /* if a new supernode */
+
+    }	/* else: jcol > 0 */ 
+    
+    /* Tidy up the pointers before exit */
+    xsup[nsuper+1] = jcolp1;
+    supno[jcolp1]  = nsuper;
+    xprune[jcol]   = nextl;	/* Initialize upper bound for pruning */
+    xlsub[jcolp1]  = nextl;
+
+    return 0;
+}
diff --git a/intern/opennl/superlu/scopy_to_ucol.c b/intern/opennl/superlu/scopy_to_ucol.c
new file mode 100644
index 00000000000..fd97352923f
--- /dev/null
+++ b/intern/opennl/superlu/scopy_to_ucol.c
@@ -0,0 +1,105 @@
+
+
+/*
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ */
+/*
+  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ 
+  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ 
+  Permission is hereby granted to use or copy this program for any
+  purpose, provided the above notices are retained on all copies.
+  Permission to modify the code and to distribute modified code is
+  granted, provided the above notices are retained, and a notice that
+  the code was modified is included with the above copyright notice.
+*/
+
+#include "ssp_defs.h"
+#include "util.h"
+
+int
+scopy_to_ucol(
+	      int        jcol,	  /* in */
+	      int        nseg,	  /* in */
+	      int        *segrep,  /* in */
+	      int        *repfnz,  /* in */
+	      int        *perm_r,  /* in */
+	      float     *dense,   /* modified - reset to zero on return */
+	      GlobalLU_t *Glu      /* modified */
+	      )
+{
+/* 
+ * Gather from SPA dense[*] to global ucol[*].
+ */
+    int ksub, krep, ksupno;
+    int i, k, kfnz, segsze;
+    int fsupc, isub, irow;
+    int jsupno, nextu;
+    int new_next, mem_error;
+    int       *xsup, *supno;
+    int       *lsub, *xlsub;
+    float    *ucol;
+    int       *usub, *xusub;
+    int       nzumax;
+
+    float zero = 0.0;
+
+    xsup    = Glu->xsup;
+    supno   = Glu->supno;
+    lsub    = Glu->lsub;
+    xlsub   = Glu->xlsub;
+    ucol    = Glu->ucol;
+    usub    = Glu->usub;
+    xusub   = Glu->xusub;
+    nzumax  = Glu->nzumax;
+    
+    jsupno = supno[jcol];
+    nextu  = xusub[jcol];
+    k = nseg - 1;
+    for (ksub = 0; ksub < nseg; ksub++) {
+	krep = segrep[k--];
+	ksupno = supno[krep];
+
+	if ( ksupno != jsupno ) { /* Should go into ucol[] */
+	    kfnz = repfnz[krep];
+	    if ( kfnz != EMPTY ) {	/* Nonzero U-segment */
+
+	    	fsupc = xsup[ksupno];
+	        isub = xlsub[fsupc] + kfnz - fsupc;
+	        segsze = krep - kfnz + 1;
+
+		new_next = nextu + segsze;
+		while ( new_next > nzumax ) {
+		    if ((mem_error = sLUMemXpand(jcol, nextu, UCOL, &nzumax, Glu)))
+			return (mem_error);
+		    ucol = Glu->ucol;
+		    if ((mem_error = sLUMemXpand(jcol, nextu, USUB, &nzumax, Glu)))
+			return (mem_error);
+		    usub = Glu->usub;
+		    lsub = Glu->lsub;
+		}
+		
+		for (i = 0; i < segsze; i++) {
+		    irow = lsub[isub];
+		    usub[nextu] = perm_r[irow];
+		    ucol[nextu] = dense[irow];
+		    dense[irow] = zero;
+		    nextu++;
+		    isub++;
+		} 
+
+	    }
+
+	}
+
+    } /* for each segment... */
+
+    xusub[jcol + 1] = nextu;      /* Close U[*,jcol] */
+    return 0;
+}
diff --git a/intern/opennl/superlu/sgssv.c b/intern/opennl/superlu/sgssv.c
new file mode 100644
index 00000000000..ede3dc83907
--- /dev/null
+++ b/intern/opennl/superlu/sgssv.c
@@ -0,0 +1,221 @@
+
+
+/*
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ */
+#include "ssp_defs.h"
+
+void
+sgssv(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r,
+      SuperMatrix *L, SuperMatrix *U, SuperMatrix *B,
+      SuperLUStat_t *stat, int *info )
+{
+/*
+ * Purpose
+ * =======
+ *
+ * SGSSV solves the system of linear equations A*X=B, using the
+ * LU factorization from SGSTRF. It performs the following steps:
+ *
+ *   1. If A is stored column-wise (A->Stype = SLU_NC):
+ *
+ *      1.1. Permute the columns of A, forming A*Pc, where Pc
+ *           is a permutation matrix. For more details of this step, 
+ *           see sp_preorder.c.
+ *
+ *      1.2. Factor A as Pr*A*Pc=L*U with the permutation Pr determined
+ *           by Gaussian elimination with partial pivoting.
+ *           L is unit lower triangular with offdiagonal entries
+ *           bounded by 1 in magnitude, and U is upper triangular.
+ *
+ *      1.3. Solve the system of equations A*X=B using the factored
+ *           form of A.
+ *
+ *   2. If A is stored row-wise (A->Stype = SLU_NR), apply the
+ *      above algorithm to the transpose of A:
+ *
+ *      2.1. Permute columns of transpose(A) (rows of A),
+ *           forming transpose(A)*Pc, where Pc is a permutation matrix. 
+ *           For more details of this step, see sp_preorder.c.
+ *
+ *      2.2. Factor A as Pr*transpose(A)*Pc=L*U with the permutation Pr
+ *           determined by Gaussian elimination with partial pivoting.
+ *           L is unit lower triangular with offdiagonal entries
+ *           bounded by 1 in magnitude, and U is upper triangular.
+ *
+ *      2.3. Solve the system of equations A*X=B using the factored
+ *           form of A.
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ * 
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed and how the
+ *         system will be solved.
+ *
+ * A       (input) SuperMatrix*
+ *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
+ *         of linear equations is A->nrow. Currently, the type of A can be:
+ *         Stype = SLU_NC or SLU_NR; Dtype = SLU_S; Mtype = SLU_GE.
+ *         In the future, more general A may be handled.
+ *
+ * perm_c  (input/output) int*
+ *         If A->Stype = SLU_NC, column permutation vector of size A->ncol
+ *         which defines the permutation matrix Pc; perm_c[i] = j means 
+ *         column i of A is in position j in A*Pc.
+ *         If A->Stype = SLU_NR, column permutation vector of size A->nrow
+ *         which describes permutation of columns of transpose(A) 
+ *         (rows of A) as described above.
+ * 
+ *         If options->ColPerm = MY_PERMC or options->Fact = SamePattern or
+ *            options->Fact = SamePattern_SameRowPerm, it is an input argument.
+ *            On exit, perm_c may be overwritten by the product of the input
+ *            perm_c and a permutation that postorders the elimination tree
+ *            of Pc'*A'*A*Pc; perm_c is not changed if the elimination tree
+ *            is already in postorder.
+ *         Otherwise, it is an output argument.
+ * 
+ * perm_r  (input/output) int*
+ *         If A->Stype = SLU_NC, row permutation vector of size A->nrow, 
+ *         which defines the permutation matrix Pr, and is determined 
+ *         by partial pivoting.  perm_r[i] = j means row i of A is in 
+ *         position j in Pr*A.
+ *         If A->Stype = SLU_NR, permutation vector of size A->ncol, which
+ *         determines permutation of rows of transpose(A)
+ *         (columns of A) as described above.
+ *
+ *         If options->RowPerm = MY_PERMR or
+ *            options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *            input argument.
+ *         otherwise it is an output argument.
+ *
+ * L       (output) SuperMatrix*
+ *         The factor L from the factorization 
+ *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
+ *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
+ *         Uses compressed row subscripts storage for supernodes, i.e.,
+ *         L has types: Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU.
+ *         
+ * U       (output) SuperMatrix*
+ *	   The factor U from the factorization 
+ *             Pr*A*Pc=L*U              (if A->Stype = SLU_NC) or
+ *             Pr*transpose(A)*Pc=L*U   (if A->Stype = SLU_NR).
+ *         Uses column-wise storage scheme, i.e., U has types:
+ *         Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_TRU.
+ *
+ * B       (input/output) SuperMatrix*
+ *         B has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE.
+ *         On entry, the right hand side matrix.
+ *         On exit, the solution matrix if info = 0;
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *	   = 0: successful exit
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *   
+ */
+    DNformat *Bstore;
+    SuperMatrix *AA = NULL;/* A in SLU_NC format used by the factorization routine.*/
+    SuperMatrix AC; /* Matrix postmultiplied by Pc */
+    int      lwork = 0, *etree, i;
+    
+    /* Set default values for some parameters */
+    int      panel_size;     /* panel size */
+    int      relax;          /* no of columns in a relaxed snodes */
+    int      permc_spec;
+    trans_t  trans = NOTRANS;
+    double   *utime;
+    double   t;	/* Temporary time */
+
+    /* Test the input parameters ... */
+    *info = 0;
+    Bstore = B->Store;
+    if ( options->Fact != DOFACT ) *info = -1;
+    else if ( A->nrow != A->ncol || A->nrow < 0 ||
+	 (A->Stype != SLU_NC && A->Stype != SLU_NR) ||
+	 A->Dtype != SLU_S || A->Mtype != SLU_GE )
+	*info = -2;
+    else if ( B->ncol < 0 || Bstore->lda < SUPERLU_MAX(0, A->nrow) ||
+	B->Stype != SLU_DN || B->Dtype != SLU_S || B->Mtype != SLU_GE )
+	*info = -7;
+    if ( *info != 0 ) {
+	i = -(*info);
+	xerbla_("sgssv", &i);
+	return;
+    }
+
+    utime = stat->utime;
+
+    /* Convert A to SLU_NC format when necessary. */
+    if ( A->Stype == SLU_NR ) {
+	NRformat *Astore = A->Store;
+	AA = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
+	sCreate_CompCol_Matrix(AA, A->ncol, A->nrow, Astore->nnz, 
+			       Astore->nzval, Astore->colind, Astore->rowptr,
+			       SLU_NC, A->Dtype, A->Mtype);
+	trans = TRANS;
+    } else {
+        if ( A->Stype == SLU_NC ) AA = A;
+    }
+
+    t = SuperLU_timer_();
+    /*
+     * Get column permutation vector perm_c[], according to permc_spec:
+     *   permc_spec = NATURAL:  natural ordering 
+     *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
+     *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
+     *   permc_spec = COLAMD:   approximate minimum degree column ordering
+     *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
+     */
+    permc_spec = options->ColPerm;
+    if ( permc_spec != MY_PERMC && options->Fact == DOFACT )
+      get_perm_c(permc_spec, AA, perm_c);
+    utime[COLPERM] = SuperLU_timer_() - t;
+
+    etree = intMalloc(A->ncol);
+
+    t = SuperLU_timer_();
+    sp_preorder(options, AA, perm_c, etree, &AC);
+    utime[ETREE] = SuperLU_timer_() - t;
+
+    panel_size = sp_ienv(1);
+    relax = sp_ienv(2);
+
+    /*printf("Factor PA = LU ... relax %d\tw %d\tmaxsuper %d\trowblk %d\n", 
+	  relax, panel_size, sp_ienv(3), sp_ienv(4));*/
+    t = SuperLU_timer_(); 
+    /* Compute the LU factorization of A. */
+    sgstrf(options, &AC, relax, panel_size,
+	   etree, NULL, lwork, perm_c, perm_r, L, U, stat, info);
+    utime[FACT] = SuperLU_timer_() - t;
+
+    t = SuperLU_timer_();
+    if ( *info == 0 ) {
+        /* Solve the system A*X=B, overwriting B with X. */
+        sgstrs (trans, L, U, perm_c, perm_r, B, stat, info);
+    }
+    utime[SOLVE] = SuperLU_timer_() - t;
+
+    SUPERLU_FREE (etree);
+    Destroy_CompCol_Permuted(&AC);
+    if ( A->Stype == SLU_NR ) {
+	Destroy_SuperMatrix_Store(AA);
+	SUPERLU_FREE(AA);
+    }
+
+}
diff --git a/intern/opennl/superlu/sgstrf.c b/intern/opennl/superlu/sgstrf.c
new file mode 100644
index 00000000000..42f8dc9d0ee
--- /dev/null
+++ b/intern/opennl/superlu/sgstrf.c
@@ -0,0 +1,433 @@
+
+/*
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ */
+/*
+  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ 
+  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ 
+  Permission is hereby granted to use or copy this program for any
+  purpose, provided the above notices are retained on all copies.
+  Permission to modify the code and to distribute modified code is
+  granted, provided the above notices are retained, and a notice that
+  the code was modified is included with the above copyright notice.
+*/
+
+#include "ssp_defs.h"
+
+void
+sgstrf (superlu_options_t *options, SuperMatrix *A,
+        int relax, int panel_size, int *etree, void *work, int lwork,
+        int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U,
+        SuperLUStat_t *stat, int *info)
+{
+/*
+ * Purpose
+ * =======
+ *
+ * SGSTRF computes an LU factorization of a general sparse m-by-n
+ * matrix A using partial pivoting with row interchanges.
+ * The factorization has the form
+ *     Pr * A = L * U
+ * where Pr is a row permutation matrix, L is lower triangular with unit
+ * diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper 
+ * triangular (upper trapezoidal if A->nrow < A->ncol).
+ *
+ * See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *
+ * A        (input) SuperMatrix*
+ *	    Original matrix A, permuted by columns, of dimension
+ *          (A->nrow, A->ncol). The type of A can be:
+ *          Stype = SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE.
+ *
+ * drop_tol (input) float (NOT IMPLEMENTED)
+ *	    Drop tolerance parameter. At step j of the Gaussian elimination,
+ *          if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij.
+ *          0 <= drop_tol <= 1. The default value of drop_tol is 0.
+ *
+ * relax    (input) int
+ *          To control degree of relaxing supernodes. If the number
+ *          of nodes (columns) in a subtree of the elimination tree is less
+ *          than relax, this subtree is considered as one supernode,
+ *          regardless of the row structures of those columns.
+ *
+ * panel_size (input) int
+ *          A panel consists of at most panel_size consecutive columns.
+ *
+ * etree    (input) int*, dimension (A->ncol)
+ *          Elimination tree of A'*A.
+ *          Note: etree is a vector of parent pointers for a forest whose
+ *          vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *          On input, the columns of A should be permuted so that the
+ *          etree is in a certain postorder.
+ *
+ * work     (input/output) void*, size (lwork) (in bytes)
+ *          User-supplied work space and space for the output data structures.
+ *          Not referenced if lwork = 0;
+ *
+ * lwork   (input) int
+ *         Specifies the size of work array in bytes.
+ *         = 0:  allocate space internally by system malloc;
+ *         > 0:  use user-supplied work array of length lwork in bytes,
+ *               returns error if space runs out.
+ *         = -1: the routine guesses the amount of space needed without
+ *               performing the factorization, and returns it in
+ *               *info; no other side effects.
+ *
+ * perm_c   (input) int*, dimension (A->ncol)
+ *	    Column permutation vector, which defines the 
+ *          permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *          in position j in A*Pc.
+ *          When searching for diagonal, perm_c[*] is applied to the
+ *          row subscripts of A, so that diagonal threshold pivoting
+ *          can find the diagonal of A, rather than that of A*Pc.
+ *
+ * perm_r   (input/output) int*, dimension (A->nrow)
+ *          Row permutation vector which defines the permutation matrix Pr,
+ *          perm_r[i] = j means row i of A is in position j in Pr*A.
+ *          If options->Fact = SamePattern_SameRowPerm, the pivoting routine
+ *             will try to use the input perm_r, unless a certain threshold
+ *             criterion is violated. In that case, perm_r is overwritten by
+ *             a new permutation determined by partial pivoting or diagonal
+ *             threshold pivoting.
+ *          Otherwise, perm_r is output argument;
+ *
+ * L        (output) SuperMatrix*
+ *          The factor L from the factorization Pr*A=L*U; use compressed row 
+ *          subscripts storage for supernodes, i.e., L has type: 
+ *          Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU.
+ *
+ * U        (output) SuperMatrix*
+ *	    The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
+ *          storage scheme, i.e., U has types: Stype = SLU_NC, 
+ *          Dtype = SLU_S, Mtype = SLU_TRU.
+ *
+ * stat     (output) SuperLUStat_t*
+ *          Record the statistics on runtime and floating-point operation count.
+ *          See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info     (output) int*
+ *          = 0: successful exit
+ *          < 0: if info = -i, the i-th argument had an illegal value
+ *          > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                and division by zero will occur if it is used to solve a
+ *                system of equations.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol. If lwork = -1, it is
+ *                the estimated amount of space needed, plus A->ncol.
+ *
+ * ======================================================================
+ *
+ * Local Working Arrays: 
+ * ======================
+ *   m = number of rows in the matrix
+ *   n = number of columns in the matrix
+ *
+ *   xprune[0:n-1]: xprune[*] points to locations in subscript 
+ *	vector lsub[*]. For column i, xprune[i] denotes the point where 
+ *	structural pruning begins. I.e. only xlsub[i],..,xprune[i]-1 need 
+ *	to be traversed for symbolic factorization.
+ *
+ *   marker[0:3*m-1]: marker[i] = j means that node i has been 
+ *	reached when working on column j.
+ *	Storage: relative to original row subscripts
+ *	NOTE: There are 3 of them: marker/marker1 are used for panel dfs, 
+ *	      see spanel_dfs.c; marker2 is used for inner-factorization,
+ *            see scolumn_dfs.c.
+ *
+ *   parent[0:m-1]: parent vector used during dfs
+ *      Storage: relative to new row subscripts
+ *
+ *   xplore[0:m-1]: xplore[i] gives the location of the next (dfs) 
+ *	unexplored neighbor of i in lsub[*]
+ *
+ *   segrep[0:nseg-1]: contains the list of supernodal representatives
+ *	in topological order of the dfs. A supernode representative is the 
+ *	last column of a supernode.
+ *      The maximum size of segrep[] is n.
+ *
+ *   repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a 
+ *	supernodal representative r, repfnz[r] is the location of the first 
+ *	nonzero in this segment.  It is also used during the dfs: repfnz[r]>0
+ *	indicates the supernode r has been explored.
+ *	NOTE: There are W of them, each used for one column of a panel. 
+ *
+ *   panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below 
+ *      the panel diagonal. These are filled in during spanel_dfs(), and are
+ *      used later in the inner LU factorization within the panel.
+ *	panel_lsub[]/dense[] pair forms the SPA data structure.
+ *	NOTE: There are W of them.
+ *
+ *   dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values;
+ *	    	   NOTE: there are W of them.
+ *
+ *   tempv[0:*]: real temporary used for dense numeric kernels;
+ *	The size of this array is defined by NUM_TEMPV() in ssp_defs.h.
+ *
+ */
+    /* Local working arrays */
+    NCPformat *Astore;
+    int       *iperm_r = NULL; /* inverse of perm_r;
+			   used when options->Fact == SamePattern_SameRowPerm */
+    int       *iperm_c; /* inverse of perm_c */
+    int       *iwork;
+    float    *swork;
+    int	      *segrep, *repfnz, *parent, *xplore;
+    int	      *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */
+    int	      *xprune;
+    int	      *marker;
+    float    *dense, *tempv;
+    int       *relax_end;
+    float    *a;
+    int       *asub;
+    int       *xa_begin, *xa_end;
+    int       *xsup, *supno;
+    int       *xlsub, *xlusup, *xusub;
+    int       nzlumax;
+    static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */
+
+    /* Local scalars */
+    fact_t    fact = options->Fact;
+    double    diag_pivot_thresh = options->DiagPivotThresh;
+    int       pivrow;   /* pivotal row number in the original matrix A */
+    int       nseg1;	/* no of segments in U-column above panel row jcol */
+    int       nseg;	/* no of segments in each U-column */
+    register int jcol;	
+    register int kcol;	/* end column of a relaxed snode */
+    register int icol;
+    register int i, k, jj, new_next, iinfo;
+    int       m, n, min_mn, jsupno, fsupc, nextlu, nextu;
+    int       w_def;	/* upper bound on panel width */
+    int       usepr, iperm_r_allocated = 0;
+    int       nnzL, nnzU;
+    int       *panel_histo = stat->panel_histo;
+    flops_t   *ops = stat->ops;
+
+    iinfo    = 0;
+    m        = A->nrow;
+    n        = A->ncol;
+    min_mn   = SUPERLU_MIN(m, n);
+    Astore   = A->Store;
+    a        = Astore->nzval;
+    asub     = Astore->rowind;
+    xa_begin = Astore->colbeg;
+    xa_end   = Astore->colend;
+
+    /* Allocate storage common to the factor routines */
+    *info = sLUMemInit(fact, work, lwork, m, n, Astore->nnz,
+                       panel_size, L, U, &Glu, &iwork, &swork);
+    if ( *info ) return;
+    
+    xsup    = Glu.xsup;
+    supno   = Glu.supno;
+    xlsub   = Glu.xlsub;
+    xlusup  = Glu.xlusup;
+    xusub   = Glu.xusub;
+    
+    SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore,
+	     &repfnz, &panel_lsub, &xprune, &marker);
+    sSetRWork(m, panel_size, swork, &dense, &tempv);
+    
+    usepr = (fact == SamePattern_SameRowPerm);
+    if ( usepr ) {
+	/* Compute the inverse of perm_r */
+	iperm_r = (int *) intMalloc(m);
+	for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k;
+	iperm_r_allocated = 1;
+    }
+    iperm_c = (int *) intMalloc(n);
+    for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k;
+
+    /* Identify relaxed snodes */
+    relax_end = (int *) intMalloc(n);
+    if ( options->SymmetricMode == YES ) {
+        heap_relax_snode(n, etree, relax, marker, relax_end); 
+    } else {
+        relax_snode(n, etree, relax, marker, relax_end); 
+    }
+    
+    ifill (perm_r, m, EMPTY);
+    ifill (marker, m * NO_MARKER, EMPTY);
+    supno[0] = -1;
+    xsup[0]  = xlsub[0] = xusub[0] = xlusup[0] = 0;
+    w_def    = panel_size;
+
+    /* 
+     * Work on one "panel" at a time. A panel is one of the following: 
+     *	   (a) a relaxed supernode at the bottom of the etree, or
+     *	   (b) panel_size contiguous columns, defined by the user
+     */
+    for (jcol = 0; jcol < min_mn; ) {
+
+	if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */
+   	    kcol = relax_end[jcol];	  /* end of the relaxed snode */
+	    panel_histo[kcol-jcol+1]++;
+
+	    /* --------------------------------------
+	     * Factorize the relaxed supernode(jcol:kcol) 
+	     * -------------------------------------- */
+	    /* Determine the union of the row structure of the snode */
+	    if ( (*info = ssnode_dfs(jcol, kcol, asub, xa_begin, xa_end,
+				    xprune, marker, &Glu)) != 0 )
+		return;
+
+            nextu    = xusub[jcol];
+	    nextlu   = xlusup[jcol];
+	    jsupno   = supno[jcol];
+	    fsupc    = xsup[jsupno];
+	    new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1);
+	    nzlumax = Glu.nzlumax;
+	    while ( new_next > nzlumax ) {
+		if ( (*info = sLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu)) )
+		    return;
+	    }
+    
+	    for (icol = jcol; icol<= kcol; icol++) {
+		xusub[icol+1] = nextu;
+		
+    		/* Scatter into SPA dense[*] */
+    		for (k = xa_begin[icol]; k < xa_end[icol]; k++)
+        	    dense[asub[k]] = a[k];
+
+	       	/* Numeric update within the snode */
+	        ssnode_bmod(icol, fsupc, dense, tempv, &Glu, stat);
+
+		if ( (*info = spivotL(icol, diag_pivot_thresh, &usepr, perm_r,
+				      iperm_r, iperm_c, &pivrow, &Glu, stat)) )
+		    if ( iinfo == 0 ) iinfo = *info;
+		
+#ifdef DEBUG
+		sprint_lu_col("[1]: ", icol, pivrow, xprune, &Glu);
+#endif
+
+	    }
+
+	    jcol = icol;
+
+	} else { /* Work on one panel of panel_size columns */
+	    
+	    /* Adjust panel_size so that a panel won't overlap with the next 
+	     * relaxed snode.
+	     */
+	    panel_size = w_def;
+	    for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++) 
+		if ( relax_end[k] != EMPTY ) {
+		    panel_size = k - jcol;
+		    break;
+		}
+	    if ( k == min_mn ) panel_size = min_mn - jcol;
+	    panel_histo[panel_size]++;
+
+	    /* symbolic factor on a panel of columns */
+	    spanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1,
+		      dense, panel_lsub, segrep, repfnz, xprune,
+		      marker, parent, xplore, &Glu);
+	    
+	    /* numeric sup-panel updates in topological order */
+	    spanel_bmod(m, panel_size, jcol, nseg1, dense,
+		        tempv, segrep, repfnz, &Glu, stat);
+	    
+	    /* Sparse LU within the panel, and below panel diagonal */
+    	    for ( jj = jcol; jj < jcol + panel_size; jj++) {
+ 		k = (jj - jcol) * m; /* column index for w-wide arrays */
+
+		nseg = nseg1;	/* Begin after all the panel segments */
+
+	    	if ((*info = scolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k],
+					segrep, &repfnz[k], xprune, marker,
+					parent, xplore, &Glu)) != 0) return;
+
+	      	/* Numeric updates */
+	    	if ((*info = scolumn_bmod(jj, (nseg - nseg1), &dense[k],
+					 tempv, &segrep[nseg1], &repfnz[k],
+					 jcol, &Glu, stat)) != 0) return;
+		
+	        /* Copy the U-segments to ucol[*] */
+		if ((*info = scopy_to_ucol(jj, nseg, segrep, &repfnz[k],
+					  perm_r, &dense[k], &Glu)) != 0)
+		    return;
+
+	    	if ( (*info = spivotL(jj, diag_pivot_thresh, &usepr, perm_r,
+				      iperm_r, iperm_c, &pivrow, &Glu, stat)) )
+		    if ( iinfo == 0 ) iinfo = *info;
+
+		/* Prune columns (0:jj-1) using column jj */
+	    	spruneL(jj, perm_r, pivrow, nseg, segrep,
+                        &repfnz[k], xprune, &Glu);
+
+		/* Reset repfnz[] for this column */
+	    	resetrep_col (nseg, segrep, &repfnz[k]);
+		
+#ifdef DEBUG
+		sprint_lu_col("[2]: ", jj, pivrow, xprune, &Glu);
+#endif
+
+	    }
+
+   	    jcol += panel_size;	/* Move to the next panel */
+
+	} /* else */
+
+    } /* for */
+
+    *info = iinfo;
+    
+    if ( m > n ) {
+	k = 0;
+        for (i = 0; i < m; ++i) 
+            if ( perm_r[i] == EMPTY ) {
+    		perm_r[i] = n + k;
+		++k;
+	    }
+    }
+
+    countnz(min_mn, xprune, &nnzL, &nnzU, &Glu);
+    fixupL(min_mn, perm_r, &Glu);
+
+    sLUWorkFree(iwork, swork, &Glu); /* Free work space and compress storage */
+
+    if ( fact == SamePattern_SameRowPerm ) {
+        /* L and U structures may have changed due to possibly different
+	   pivoting, even though the storage is available.
+	   There could also be memory expansions, so the array locations
+           may have changed, */
+        ((SCformat *)L->Store)->nnz = nnzL;
+	((SCformat *)L->Store)->nsuper = Glu.supno[n];
+	((SCformat *)L->Store)->nzval = Glu.lusup;
+	((SCformat *)L->Store)->nzval_colptr = Glu.xlusup;
+	((SCformat *)L->Store)->rowind = Glu.lsub;
+	((SCformat *)L->Store)->rowind_colptr = Glu.xlsub;
+	((NCformat *)U->Store)->nnz = nnzU;
+	((NCformat *)U->Store)->nzval = Glu.ucol;
+	((NCformat *)U->Store)->rowind = Glu.usub;
+	((NCformat *)U->Store)->colptr = Glu.xusub;
+    } else {
+        sCreate_SuperNode_Matrix(L, A->nrow, A->ncol, nnzL, Glu.lusup, 
+	                         Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno,
+			         Glu.xsup, SLU_SC, SLU_S, SLU_TRLU);
+    	sCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol, 
+			       Glu.usub, Glu.xusub, SLU_NC, SLU_S, SLU_TRU);
+    }
+    
+    ops[FACT] += ops[TRSV] + ops[GEMV];	
+    
+    if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r);
+    SUPERLU_FREE (iperm_c);
+    SUPERLU_FREE (relax_end);
+
+}
diff --git a/intern/opennl/superlu/sgstrs.c b/intern/opennl/superlu/sgstrs.c
new file mode 100644
index 00000000000..5f7b9b57195
--- /dev/null
+++ b/intern/opennl/superlu/sgstrs.c
@@ -0,0 +1,331 @@
+
+/*
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ */
+/*
+  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ 
+  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ 
+  Permission is hereby granted to use or copy this program for any
+  purpose, provided the above notices are retained on all copies.
+  Permission to modify the code and to distribute modified code is
+  granted, provided the above notices are retained, and a notice that
+  the code was modified is included with the above copyright notice.
+*/
+
+#include "ssp_defs.h"
+
+
+/* 
+ * Function prototypes 
+ */
+void susolve(int, int, float*, float*);
+void slsolve(int, int, float*, float*);
+void smatvec(int, int, int, float*, float*, float*);
+
+
+void
+sgstrs (trans_t trans, SuperMatrix *L, SuperMatrix *U,
+        int *perm_c, int *perm_r, SuperMatrix *B,
+        SuperLUStat_t *stat, int *info)
+{
+/*
+ * Purpose
+ * =======
+ *
+ * SGSTRS solves a system of linear equations A*X=B or A'*X=B
+ * with A sparse and B dense, using the LU factorization computed by
+ * SGSTRF.
+ *
+ * See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ * Arguments
+ * =========
+ *
+ * trans   (input) trans_t
+ *          Specifies the form of the system of equations:
+ *          = NOTRANS: A * X = B  (No transpose)
+ *          = TRANS:   A'* X = B  (Transpose)
+ *          = CONJ:    A**H * X = B  (Conjugate transpose)
+ *
+ * L       (input) SuperMatrix*
+ *         The factor L from the factorization Pr*A*Pc=L*U as computed by
+ *         sgstrf(). Use compressed row subscripts storage for supernodes,
+ *         i.e., L has types: Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU.
+ *
+ * U       (input) SuperMatrix*
+ *         The factor U from the factorization Pr*A*Pc=L*U as computed by
+ *         sgstrf(). Use column-wise storage scheme, i.e., U has types:
+ *         Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_TRU.
+ *
+ * perm_c  (input) int*, dimension (L->ncol)
+ *	   Column permutation vector, which defines the 
+ *         permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *         in position j in A*Pc.
+ *
+ * perm_r  (input) int*, dimension (L->nrow)
+ *         Row permutation vector, which defines the permutation matrix Pr; 
+ *         perm_r[i] = j means row i of A is in position j in Pr*A.
+ *
+ * B       (input/output) SuperMatrix*
+ *         B has types: Stype = SLU_DN, Dtype = SLU_S, Mtype = SLU_GE.
+ *         On entry, the right hand side matrix.
+ *         On exit, the solution matrix if info = 0;
+ *
+ * stat     (output) SuperLUStat_t*
+ *          Record the statistics on runtime and floating-point operation count.
+ *          See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
+ *
+ */
+#ifdef _CRAY
+    _fcd ftcs1, ftcs2, ftcs3, ftcs4;
+#endif
+#ifdef USE_VENDOR_BLAS
+    float   alpha = 1.0, beta = 1.0;
+    float   *work_col;
+#endif
+    DNformat *Bstore;
+    float   *Bmat;
+    SCformat *Lstore;
+    NCformat *Ustore;
+    float   *Lval, *Uval;
+    int      fsupc, nrow, nsupr, nsupc, luptr, istart, irow;
+    int      i, j, k, iptr, jcol, n, ldb, nrhs;
+    float   *work, *rhs_work, *soln;
+    flops_t  solve_ops;
+    void sprint_soln();
+
+    /* Test input parameters ... */
+    *info = 0;
+    Bstore = B->Store;
+    ldb = Bstore->lda;
+    nrhs = B->ncol;
+    if ( trans != NOTRANS && trans != TRANS && trans != CONJ ) *info = -1;
+    else if ( L->nrow != L->ncol || L->nrow < 0 ||
+	      L->Stype != SLU_SC || L->Dtype != SLU_S || L->Mtype != SLU_TRLU )
+	*info = -2;
+    else if ( U->nrow != U->ncol || U->nrow < 0 ||
+	      U->Stype != SLU_NC || U->Dtype != SLU_S || U->Mtype != SLU_TRU )
+	*info = -3;
+    else if ( ldb < SUPERLU_MAX(0, L->nrow) ||
+	      B->Stype != SLU_DN || B->Dtype != SLU_S || B->Mtype != SLU_GE )
+	*info = -6;
+    if ( *info ) {
+	i = -(*info);
+	xerbla_("sgstrs", &i);
+	return;
+    }
+
+    n = L->nrow;
+    work = floatCalloc(n * nrhs);
+    if ( !work ) ABORT("Malloc fails for local work[].");
+    soln = floatMalloc(n);
+    if ( !soln ) ABORT("Malloc fails for local soln[].");
+
+    Bmat = Bstore->nzval;
+    Lstore = L->Store;
+    Lval = Lstore->nzval;
+    Ustore = U->Store;
+    Uval = Ustore->nzval;
+    solve_ops = 0;
+    
+    if ( trans == NOTRANS ) {
+	/* Permute right hand sides to form Pr*B */
+	for (i = 0; i < nrhs; i++) {
+	    rhs_work = &Bmat[i*ldb];
+	    for (k = 0; k < n; k++) soln[perm_r[k]] = rhs_work[k];
+	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
+	}
+	
+	/* Forward solve PLy=Pb. */
+	for (k = 0; k <= Lstore->nsuper; k++) {
+	    fsupc = L_FST_SUPC(k);
+	    istart = L_SUB_START(fsupc);
+	    nsupr = L_SUB_START(fsupc+1) - istart;
+	    nsupc = L_FST_SUPC(k+1) - fsupc;
+	    nrow = nsupr - nsupc;
+
+	    solve_ops += nsupc * (nsupc - 1) * nrhs;
+	    solve_ops += 2 * nrow * nsupc * nrhs;
+	    
+	    if ( nsupc == 1 ) {
+		for (j = 0; j < nrhs; j++) {
+		    rhs_work = &Bmat[j*ldb];
+	    	    luptr = L_NZ_START(fsupc);
+		    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); iptr++){
+			irow = L_SUB(iptr);
+			++luptr;
+			rhs_work[irow] -= rhs_work[fsupc] * Lval[luptr];
+		    }
+		}
+	    } else {
+	    	luptr = L_NZ_START(fsupc);
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+		ftcs1 = _cptofcd("L", strlen("L"));
+		ftcs2 = _cptofcd("N", strlen("N"));
+		ftcs3 = _cptofcd("U", strlen("U"));
+		STRSM( ftcs1, ftcs1, ftcs2, ftcs3, &nsupc, &nrhs, &alpha,
+		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
+		
+		SGEMM( ftcs2, ftcs2, &nrow, &nrhs, &nsupc, &alpha, 
+			&Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, 
+			&beta, &work[0], &n );
+#else
+		strsm_("L", "L", "N", "U", &nsupc, &nrhs, &alpha,
+		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
+		
+		sgemm_( "N", "N", &nrow, &nrhs, &nsupc, &alpha, 
+			&Lval[luptr+nsupc], &nsupr, &Bmat[fsupc], &ldb, 
+			&beta, &work[0], &n );
+#endif
+		for (j = 0; j < nrhs; j++) {
+		    rhs_work = &Bmat[j*ldb];
+		    work_col = &work[j*n];
+		    iptr = istart + nsupc;
+		    for (i = 0; i < nrow; i++) {
+			irow = L_SUB(iptr);
+			rhs_work[irow] -= work_col[i]; /* Scatter */
+			work_col[i] = 0.0;
+			iptr++;
+		    }
+		}
+#else		
+		for (j = 0; j < nrhs; j++) {
+		    rhs_work = &Bmat[j*ldb];
+		    slsolve (nsupr, nsupc, &Lval[luptr], &rhs_work[fsupc]);
+		    smatvec (nsupr, nrow, nsupc, &Lval[luptr+nsupc],
+			    &rhs_work[fsupc], &work[0] );
+
+		    iptr = istart + nsupc;
+		    for (i = 0; i < nrow; i++) {
+			irow = L_SUB(iptr);
+			rhs_work[irow] -= work[i];
+			work[i] = 0.0;
+			iptr++;
+		    }
+		}
+#endif		    
+	    } /* else ... */
+	} /* for L-solve */
+
+#ifdef DEBUG
+  	printf("After L-solve: y=\n");
+	sprint_soln(n, Bmat);
+#endif
+
+	/*
+	 * Back solve Ux=y.
+	 */
+	for (k = Lstore->nsuper; k >= 0; k--) {
+	    fsupc = L_FST_SUPC(k);
+	    istart = L_SUB_START(fsupc);
+	    nsupr = L_SUB_START(fsupc+1) - istart;
+	    nsupc = L_FST_SUPC(k+1) - fsupc;
+	    luptr = L_NZ_START(fsupc);
+
+	    solve_ops += nsupc * (nsupc + 1) * nrhs;
+
+	    if ( nsupc == 1 ) {
+		rhs_work = &Bmat[0];
+		for (j = 0; j < nrhs; j++) {
+		    rhs_work[fsupc] /= Lval[luptr];
+		    rhs_work += ldb;
+		}
+	    } else {
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+		ftcs1 = _cptofcd("L", strlen("L"));
+		ftcs2 = _cptofcd("U", strlen("U"));
+		ftcs3 = _cptofcd("N", strlen("N"));
+		STRSM( ftcs1, ftcs2, ftcs3, ftcs3, &nsupc, &nrhs, &alpha,
+		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
+#else
+		strsm_("L", "U", "N", "N", &nsupc, &nrhs, &alpha,
+		       &Lval[luptr], &nsupr, &Bmat[fsupc], &ldb);
+#endif
+#else		
+		for (j = 0; j < nrhs; j++)
+		    susolve ( nsupr, nsupc, &Lval[luptr], &Bmat[fsupc+j*ldb] );
+#endif		
+	    }
+
+	    for (j = 0; j < nrhs; ++j) {
+		rhs_work = &Bmat[j*ldb];
+		for (jcol = fsupc; jcol < fsupc + nsupc; jcol++) {
+		    solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
+		    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++ ){
+			irow = U_SUB(i);
+			rhs_work[irow] -= rhs_work[jcol] * Uval[i];
+		    }
+		}
+	    }
+	    
+	} /* for U-solve */
+
+#ifdef DEBUG
+  	printf("After U-solve: x=\n");
+	sprint_soln(n, Bmat);
+#endif
+
+	/* Compute the final solution X := Pc*X. */
+	for (i = 0; i < nrhs; i++) {
+	    rhs_work = &Bmat[i*ldb];
+	    for (k = 0; k < n; k++) soln[k] = rhs_work[perm_c[k]];
+	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
+	}
+	
+        stat->ops[SOLVE] = solve_ops;
+
+    } else { /* Solve A'*X=B or CONJ(A)*X=B */
+	/* Permute right hand sides to form Pc'*B. */
+	for (i = 0; i < nrhs; i++) {
+	    rhs_work = &Bmat[i*ldb];
+	    for (k = 0; k < n; k++) soln[perm_c[k]] = rhs_work[k];
+	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
+	}
+
+	stat->ops[SOLVE] = 0;
+	for (k = 0; k < nrhs; ++k) {
+	    
+	    /* Multiply by inv(U'). */
+	    sp_strsv("U", "T", "N", L, U, &Bmat[k*ldb], stat, info);
+	    
+	    /* Multiply by inv(L'). */
+	    sp_strsv("L", "T", "U", L, U, &Bmat[k*ldb], stat, info);
+	    
+	}
+	/* Compute the final solution X := Pr'*X (=inv(Pr)*X) */
+	for (i = 0; i < nrhs; i++) {
+	    rhs_work = &Bmat[i*ldb];
+	    for (k = 0; k < n; k++) soln[k] = rhs_work[perm_r[k]];
+	    for (k = 0; k < n; k++) rhs_work[k] = soln[k];
+	}
+
+    }
+
+    SUPERLU_FREE(work);
+    SUPERLU_FREE(soln);
+}
+
+/*
+ * Diagnostic print of the solution vector 
+ */
+void
+sprint_soln(int n, float *soln)
+{
+    int i;
+
+    for (i = 0; i < n; i++) 
+  	printf("\t%d: %.4f\n", i, soln[i]);
+}
diff --git a/intern/opennl/superlu/smemory.c b/intern/opennl/superlu/smemory.c
new file mode 100644
index 00000000000..79da748671a
--- /dev/null
+++ b/intern/opennl/superlu/smemory.c
@@ -0,0 +1,676 @@
+
+/*
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ */
+#include "ssp_defs.h"
+
+/* Constants */
+#define NO_MEMTYPE  4      /* 0: lusup;
+			      1: ucol;
+			      2: lsub;
+			      3: usub */
+#define GluIntArray(n)   (5 * (n) + 5)
+
+/* Internal prototypes */
+void  *sexpand (int *, MemType,int, int, GlobalLU_t *);
+int   sLUWorkInit (int, int, int, int **, float **, LU_space_t);
+void  copy_mem_float (int, void *, void *);
+void  sStackCompress (GlobalLU_t *);
+void  sSetupSpace (void *, int, LU_space_t *);
+void  *suser_malloc (int, int);
+void  suser_free (int, int);
+
+/* External prototypes (in memory.c - prec-indep) */
+extern void    copy_mem_int    (int, void *, void *);
+extern void    user_bcopy      (char *, char *, int);
+
+/* Headers for 4 types of dynamatically managed memory */
+typedef struct e_node {
+    int size;      /* length of the memory that has been used */
+    void *mem;     /* pointer to the new malloc'd store */
+} ExpHeader;
+
+typedef struct {
+    int  size;
+    int  used;
+    int  top1;  /* grow upward, relative to &array[0] */
+    int  top2;  /* grow downward */
+    void *array;
+} LU_stack_t;
+
+/* Variables local to this file */
+static ExpHeader *expanders = 0; /* Array of pointers to 4 types of memory */
+static LU_stack_t stack;
+static int no_expand;
+
+/* Macros to manipulate stack */
+#define StackFull(x)         ( x + stack.used >= stack.size )
+#define NotDoubleAlign(addr) ( (long int)addr & 7 )
+#define DoubleAlign(addr)    ( ((long int)addr + 7) & ~7L )
+#define TempSpace(m, w)      ( (2*w + 4 + NO_MARKER) * m * sizeof(int) + \
+			      (w + 1) * m * sizeof(float) )
+#define Reduce(alpha)        ((alpha + 1) / 2)  /* i.e. (alpha-1)/2 + 1 */
+
+
+
+
+/*
+ * Setup the memory model to be used for factorization.
+ *    lwork = 0: use system malloc;
+ *    lwork > 0: use user-supplied work[] space.
+ */
+void sSetupSpace(void *work, int lwork, LU_space_t *MemModel)
+{
+    if ( lwork == 0 ) {
+	*MemModel = SYSTEM; /* malloc/free */
+    } else if ( lwork > 0 ) {
+	*MemModel = USER;   /* user provided space */
+	stack.used = 0;
+	stack.top1 = 0;
+	stack.top2 = (lwork/4)*4; /* must be word addressable */
+	stack.size = stack.top2;
+	stack.array = (void *) work;
+    }
+}
+
+
+
+void *suser_malloc(int bytes, int which_end)
+{
+    void *buf;
+    
+    if ( StackFull(bytes) ) return (NULL);
+
+    if ( which_end == HEAD ) {
+	buf = (char*) stack.array + stack.top1;
+	stack.top1 += bytes;
+    } else {
+	stack.top2 -= bytes;
+	buf = (char*) stack.array + stack.top2;
+    }
+    
+    stack.used += bytes;
+    return buf;
+}
+
+
+void suser_free(int bytes, int which_end)
+{
+    if ( which_end == HEAD ) {
+	stack.top1 -= bytes;
+    } else {
+	stack.top2 += bytes;
+    }
+    stack.used -= bytes;
+}
+
+
+
+/*
+ * mem_usage consists of the following fields:
+ *    - for_lu (float)
+ *      The amount of space used in bytes for the L\U data structures.
+ *    - total_needed (float)
+ *      The amount of space needed in bytes to perform factorization.
+ *    - expansions (int)
+ *      Number of memory expansions during the LU factorization.
+ */
+int sQuerySpace(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage)
+{
+    SCformat *Lstore;
+    NCformat *Ustore;
+    register int n, iword, dword, panel_size = sp_ienv(1);
+
+    Lstore = L->Store;
+    Ustore = U->Store;
+    n = L->ncol;
+    iword = sizeof(int);
+    dword = sizeof(float);
+
+    /* For LU factors */
+    mem_usage->for_lu = (float)( (4*n + 3) * iword + Lstore->nzval_colptr[n] *
+				 dword + Lstore->rowind_colptr[n] * iword );
+    mem_usage->for_lu += (float)( (n + 1) * iword +
+				 Ustore->colptr[n] * (dword + iword) );
+
+    /* Working storage to support factorization */
+    mem_usage->total_needed = mem_usage->for_lu +
+	(float)( (2 * panel_size + 4 + NO_MARKER) * n * iword +
+		(panel_size + 1) * n * dword );
+
+    mem_usage->expansions = --no_expand;
+
+    return 0;
+} /* sQuerySpace */
+
+/*
+ * Allocate storage for the data structures common to all factor routines.
+ * For those unpredictable size, make a guess as FILL * nnz(A).
+ * Return value:
+ *     If lwork = -1, return the estimated amount of space required, plus n;
+ *     otherwise, return the amount of space actually allocated when
+ *     memory allocation failure occurred.
+ */
+int
+sLUMemInit(fact_t fact, void *work, int lwork, int m, int n, int annz,
+	  int panel_size, SuperMatrix *L, SuperMatrix *U, GlobalLU_t *Glu,
+	  int **iwork, float **dwork)
+{
+    int      info, iword, dword;
+    SCformat *Lstore;
+    NCformat *Ustore;
+    int      *xsup, *supno;
+    int      *lsub, *xlsub;
+    float   *lusup;
+    int      *xlusup;
+    float   *ucol;
+    int      *usub, *xusub;
+    int      nzlmax, nzumax, nzlumax;
+    int      FILL = sp_ienv(6);
+    
+    Glu->n    = n;
+    no_expand = 0;
+    iword     = sizeof(int);
+    dword     = sizeof(float);
+
+    if ( !expanders )	
+        expanders = (ExpHeader*)SUPERLU_MALLOC(NO_MEMTYPE * sizeof(ExpHeader));
+    if ( !expanders ) ABORT("SUPERLU_MALLOC fails for expanders");
+    
+    if ( fact != SamePattern_SameRowPerm ) {
+	/* Guess for L\U factors */
+	nzumax = nzlumax = FILL * annz;
+	nzlmax = SUPERLU_MAX(1, FILL/4.) * annz;
+
+	if ( lwork == -1 ) {
+	    return ( GluIntArray(n) * iword + TempSpace(m, panel_size)
+		    + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n );
+        } else {
+	    sSetupSpace(work, lwork, &Glu->MemModel);
+	}
+	
+#ifdef DEBUG		   
+	printf("sLUMemInit() called: annz %d, MemModel %d\n", 
+		annz, Glu->MemModel);
+#endif	
+	
+	/* Integer pointers for L\U factors */
+	if ( Glu->MemModel == SYSTEM ) {
+	    xsup   = intMalloc(n+1);
+	    supno  = intMalloc(n+1);
+	    xlsub  = intMalloc(n+1);
+	    xlusup = intMalloc(n+1);
+	    xusub  = intMalloc(n+1);
+	} else {
+	    xsup   = (int *)suser_malloc((n+1) * iword, HEAD);
+	    supno  = (int *)suser_malloc((n+1) * iword, HEAD);
+	    xlsub  = (int *)suser_malloc((n+1) * iword, HEAD);
+	    xlusup = (int *)suser_malloc((n+1) * iword, HEAD);
+	    xusub  = (int *)suser_malloc((n+1) * iword, HEAD);
+	}
+
+	lusup = (float *) sexpand( &nzlumax, LUSUP, 0, 0, Glu );
+	ucol  = (float *) sexpand( &nzumax, UCOL, 0, 0, Glu );
+	lsub  = (int *)    sexpand( &nzlmax, LSUB, 0, 0, Glu );
+	usub  = (int *)    sexpand( &nzumax, USUB, 0, 1, Glu );
+
+	while ( !lusup || !ucol || !lsub || !usub ) {
+	    if ( Glu->MemModel == SYSTEM ) {
+		SUPERLU_FREE(lusup); 
+		SUPERLU_FREE(ucol); 
+		SUPERLU_FREE(lsub); 
+		SUPERLU_FREE(usub);
+	    } else {
+		suser_free((nzlumax+nzumax)*dword+(nzlmax+nzumax)*iword, HEAD);
+	    }
+	    nzlumax /= 2;
+	    nzumax /= 2;
+	    nzlmax /= 2;
+	    if ( nzlumax < annz ) {
+		printf("Not enough memory to perform factorization.\n");
+		return (smemory_usage(nzlmax, nzumax, nzlumax, n) + n);
+	    }
+	    lusup = (float *) sexpand( &nzlumax, LUSUP, 0, 0, Glu );
+	    ucol  = (float *) sexpand( &nzumax, UCOL, 0, 0, Glu );
+	    lsub  = (int *)    sexpand( &nzlmax, LSUB, 0, 0, Glu );
+	    usub  = (int *)    sexpand( &nzumax, USUB, 0, 1, Glu );
+	}
+	
+    } else {
+	/* fact == SamePattern_SameRowPerm */
+	Lstore   = L->Store;
+	Ustore   = U->Store;
+	xsup     = Lstore->sup_to_col;
+	supno    = Lstore->col_to_sup;
+	xlsub    = Lstore->rowind_colptr;
+	xlusup   = Lstore->nzval_colptr;
+	xusub    = Ustore->colptr;
+	nzlmax   = Glu->nzlmax;    /* max from previous factorization */
+	nzumax   = Glu->nzumax;
+	nzlumax  = Glu->nzlumax;
+	
+	if ( lwork == -1 ) {
+	    return ( GluIntArray(n) * iword + TempSpace(m, panel_size)
+		    + (nzlmax+nzumax)*iword + (nzlumax+nzumax)*dword + n );
+        } else if ( lwork == 0 ) {
+	    Glu->MemModel = SYSTEM;
+	} else {
+	    Glu->MemModel = USER;
+	    stack.top2 = (lwork/4)*4; /* must be word-addressable */
+	    stack.size = stack.top2;
+	}
+	
+	lsub  = expanders[LSUB].mem  = Lstore->rowind;
+	lusup = expanders[LUSUP].mem = Lstore->nzval;
+	usub  = expanders[USUB].mem  = Ustore->rowind;
+	ucol  = expanders[UCOL].mem  = Ustore->nzval;;
+	expanders[LSUB].size         = nzlmax;
+	expanders[LUSUP].size        = nzlumax;
+	expanders[USUB].size         = nzumax;
+	expanders[UCOL].size         = nzumax;	
+    }
+
+    Glu->xsup    = xsup;
+    Glu->supno   = supno;
+    Glu->lsub    = lsub;
+    Glu->xlsub   = xlsub;
+    Glu->lusup   = lusup;
+    Glu->xlusup  = xlusup;
+    Glu->ucol    = ucol;
+    Glu->usub    = usub;
+    Glu->xusub   = xusub;
+    Glu->nzlmax  = nzlmax;
+    Glu->nzumax  = nzumax;
+    Glu->nzlumax = nzlumax;
+    
+    info = sLUWorkInit(m, n, panel_size, iwork, dwork, Glu->MemModel);
+    if ( info )
+	return ( info + smemory_usage(nzlmax, nzumax, nzlumax, n) + n);
+    
+    ++no_expand;
+    return 0;
+    
+} /* sLUMemInit */
+
+/* Allocate known working storage. Returns 0 if success, otherwise
+   returns the number of bytes allocated so far when failure occurred. */
+int
+sLUWorkInit(int m, int n, int panel_size, int **iworkptr, 
+            float **dworkptr, LU_space_t MemModel)
+{
+    int    isize, dsize, extra;
+    float *old_ptr;
+    int    maxsuper = sp_ienv(3),
+           rowblk   = sp_ienv(4);
+
+    isize = ( (2 * panel_size + 3 + NO_MARKER ) * m + n ) * sizeof(int);
+    dsize = (m * panel_size +
+	     NUM_TEMPV(m,panel_size,maxsuper,rowblk)) * sizeof(float);
+    
+    if ( MemModel == SYSTEM ) 
+	*iworkptr = (int *) intCalloc(isize/sizeof(int));
+    else
+	*iworkptr = (int *) suser_malloc(isize, TAIL);
+    if ( ! *iworkptr ) {
+	fprintf(stderr, "sLUWorkInit: malloc fails for local iworkptr[]\n");
+	return (isize + n);
+    }
+
+    if ( MemModel == SYSTEM )
+	*dworkptr = (float *) SUPERLU_MALLOC(dsize);
+    else {
+	*dworkptr = (float *) suser_malloc(dsize, TAIL);
+	if ( NotDoubleAlign(*dworkptr) ) {
+	    old_ptr = *dworkptr;
+	    *dworkptr = (float*) DoubleAlign(*dworkptr);
+	    *dworkptr = (float*) ((double*)*dworkptr - 1);
+	    extra = (char*)old_ptr - (char*)*dworkptr;
+#ifdef DEBUG	    
+	    printf("sLUWorkInit: not aligned, extra %d\n", extra);
+#endif	    
+	    stack.top2 -= extra;
+	    stack.used += extra;
+	}
+    }
+    if ( ! *dworkptr ) {
+	fprintf(stderr, "malloc fails for local dworkptr[].");
+	return (isize + dsize + n);
+    }
+	
+    return 0;
+}
+
+
+/*
+ * Set up pointers for real working arrays.
+ */
+void
+sSetRWork(int m, int panel_size, float *dworkptr,
+	 float **dense, float **tempv)
+{
+    float zero = 0.0;
+
+    int maxsuper = sp_ienv(3),
+        rowblk   = sp_ienv(4);
+    *dense = dworkptr;
+    *tempv = *dense + panel_size*m;
+    sfill (*dense, m * panel_size, zero);
+    sfill (*tempv, NUM_TEMPV(m,panel_size,maxsuper,rowblk), zero);     
+}
+	
+/*
+ * Free the working storage used by factor routines.
+ */
+void sLUWorkFree(int *iwork, float *dwork, GlobalLU_t *Glu)
+{
+    if ( Glu->MemModel == SYSTEM ) {
+	SUPERLU_FREE (iwork);
+	SUPERLU_FREE (dwork);
+    } else {
+	stack.used -= (stack.size - stack.top2);
+	stack.top2 = stack.size;
+/*	sStackCompress(Glu);  */
+    }
+    
+    SUPERLU_FREE (expanders);	
+    expanders = 0;
+}
+
+/* Expand the data structures for L and U during the factorization.
+ * Return value:   0 - successful return
+ *               > 0 - number of bytes allocated when run out of space
+ */
+int
+sLUMemXpand(int jcol,
+	   int next,          /* number of elements currently in the factors */
+	   MemType mem_type,  /* which type of memory to expand  */
+	   int *maxlen,       /* modified - maximum length of a data structure */
+	   GlobalLU_t *Glu    /* modified - global LU data structures */
+	   )
+{
+    void   *new_mem;
+    
+#ifdef DEBUG    
+    printf("sLUMemXpand(): jcol %d, next %d, maxlen %d, MemType %d\n",
+	   jcol, next, *maxlen, mem_type);
+#endif    
+
+    if (mem_type == USUB) 
+    	new_mem = sexpand(maxlen, mem_type, next, 1, Glu);
+    else
+	new_mem = sexpand(maxlen, mem_type, next, 0, Glu);
+    
+    if ( !new_mem ) {
+	int    nzlmax  = Glu->nzlmax;
+	int    nzumax  = Glu->nzumax;
+	int    nzlumax = Glu->nzlumax;
+    	fprintf(stderr, "Can't expand MemType %d: jcol %d\n", mem_type, jcol);
+    	return (smemory_usage(nzlmax, nzumax, nzlumax, Glu->n) + Glu->n);
+    }
+
+    switch ( mem_type ) {
+      case LUSUP:
+	Glu->lusup   = (float *) new_mem;
+	Glu->nzlumax = *maxlen;
+	break;
+      case UCOL:
+	Glu->ucol   = (float *) new_mem;
+	Glu->nzumax = *maxlen;
+	break;
+      case LSUB:
+	Glu->lsub   = (int *) new_mem;
+	Glu->nzlmax = *maxlen;
+	break;
+      case USUB:
+	Glu->usub   = (int *) new_mem;
+	Glu->nzumax = *maxlen;
+	break;
+    }
+    
+    return 0;
+    
+}
+
+
+
+void
+copy_mem_float(int howmany, void *old, void *new)
+{
+    register int i;
+    float *dold = old;
+    float *dnew = new;
+    for (i = 0; i < howmany; i++) dnew[i] = dold[i];
+}
+
+/*
+ * Expand the existing storage to accommodate more fill-ins.
+ */
+void
+*sexpand (
+	 int *prev_len,   /* length used from previous call */
+	 MemType type,    /* which part of the memory to expand */
+	 int len_to_copy, /* size of the memory to be copied to new store */
+	 int keep_prev,   /* = 1: use prev_len;
+			     = 0: compute new_len to expand */
+	 GlobalLU_t *Glu  /* modified - global LU data structures */
+	)
+{
+    float    EXPAND = 1.5;
+    float    alpha;
+    void     *new_mem, *old_mem;
+    int      new_len, tries, lword, extra, bytes_to_copy;
+
+    alpha = EXPAND;
+
+    if ( no_expand == 0 || keep_prev ) /* First time allocate requested */
+        new_len = *prev_len;
+    else {
+	new_len = alpha * *prev_len;
+    }
+    
+    if ( type == LSUB || type == USUB ) lword = sizeof(int);
+    else lword = sizeof(float);
+
+    if ( Glu->MemModel == SYSTEM ) {
+	new_mem = (void *) SUPERLU_MALLOC(new_len * lword);
+/*	new_mem = (void *) calloc(new_len, lword); */
+	if ( no_expand != 0 ) {
+	    tries = 0;
+	    if ( keep_prev ) {
+		if ( !new_mem ) return (NULL);
+	    } else {
+		while ( !new_mem ) {
+		    if ( ++tries > 10 ) return (NULL);
+		    alpha = Reduce(alpha);
+		    new_len = alpha * *prev_len;
+		    new_mem = (void *) SUPERLU_MALLOC(new_len * lword); 
+/*		    new_mem = (void *) calloc(new_len, lword); */
+		}
+	    }
+	    if ( type == LSUB || type == USUB ) {
+		copy_mem_int(len_to_copy, expanders[type].mem, new_mem);
+	    } else {
+		copy_mem_float(len_to_copy, expanders[type].mem, new_mem);
+	    }
+	    SUPERLU_FREE (expanders[type].mem);
+	}
+	expanders[type].mem = (void *) new_mem;
+	
+    } else { /* MemModel == USER */
+	if ( no_expand == 0 ) {
+	    new_mem = suser_malloc(new_len * lword, HEAD);
+	    if ( NotDoubleAlign(new_mem) &&
+		(type == LUSUP || type == UCOL) ) {
+		old_mem = new_mem;
+		new_mem = (void *)DoubleAlign(new_mem);
+		extra = (char*)new_mem - (char*)old_mem;
+#ifdef DEBUG		
+		printf("expand(): not aligned, extra %d\n", extra);
+#endif		
+		stack.top1 += extra;
+		stack.used += extra;
+	    }
+	    expanders[type].mem = (void *) new_mem;
+	}
+	else {
+	    tries = 0;
+	    extra = (new_len - *prev_len) * lword;
+	    if ( keep_prev ) {
+		if ( StackFull(extra) ) return (NULL);
+	    } else {
+		while ( StackFull(extra) ) {
+		    if ( ++tries > 10 ) return (NULL);
+		    alpha = Reduce(alpha);
+		    new_len = alpha * *prev_len;
+		    extra = (new_len - *prev_len) * lword;	    
+		}
+	    }
+
+	    if ( type != USUB ) {
+		new_mem = (void*)((char*)expanders[type + 1].mem + extra);
+		bytes_to_copy = (char*)stack.array + stack.top1
+		    - (char*)expanders[type + 1].mem;
+		user_bcopy(expanders[type+1].mem, new_mem, bytes_to_copy);
+
+		if ( type < USUB ) {
+		    Glu->usub = expanders[USUB].mem =
+			(void*)((char*)expanders[USUB].mem + extra);
+		}
+		if ( type < LSUB ) {
+		    Glu->lsub = expanders[LSUB].mem =
+			(void*)((char*)expanders[LSUB].mem + extra);
+		}
+		if ( type < UCOL ) {
+		    Glu->ucol = expanders[UCOL].mem =
+			(void*)((char*)expanders[UCOL].mem + extra);
+		}
+		stack.top1 += extra;
+		stack.used += extra;
+		if ( type == UCOL ) {
+		    stack.top1 += extra;   /* Add same amount for USUB */
+		    stack.used += extra;
+		}
+		
+	    } /* if ... */
+
+	} /* else ... */
+    }
+
+    expanders[type].size = new_len;
+    *prev_len = new_len;
+    if ( no_expand ) ++no_expand;
+    
+    return (void *) expanders[type].mem;
+    
+} /* sexpand */
+
+
+/*
+ * Compress the work[] array to remove fragmentation.
+ */
+void
+sStackCompress(GlobalLU_t *Glu)
+{
+    register int iword, dword, ndim;
+    char    *last, *fragment;
+    int      *ifrom, *ito;
+    float   *dfrom, *dto;
+    int      *xlsub, *lsub, *xusub, *usub, *xlusup;
+    float   *ucol, *lusup;
+    
+    iword = sizeof(int);
+    dword = sizeof(float);
+    ndim = Glu->n;
+
+    xlsub  = Glu->xlsub;
+    lsub   = Glu->lsub;
+    xusub  = Glu->xusub;
+    usub   = Glu->usub;
+    xlusup = Glu->xlusup;
+    ucol   = Glu->ucol;
+    lusup  = Glu->lusup;
+    
+    dfrom = ucol;
+    dto = (float *)((char*)lusup + xlusup[ndim] * dword);
+    copy_mem_float(xusub[ndim], dfrom, dto);
+    ucol = dto;
+
+    ifrom = lsub;
+    ito = (int *) ((char*)ucol + xusub[ndim] * iword);
+    copy_mem_int(xlsub[ndim], ifrom, ito);
+    lsub = ito;
+    
+    ifrom = usub;
+    ito = (int *) ((char*)lsub + xlsub[ndim] * iword);
+    copy_mem_int(xusub[ndim], ifrom, ito);
+    usub = ito;
+    
+    last = (char*)usub + xusub[ndim] * iword;
+    fragment = (char*) (((char*)stack.array + stack.top1) - last);
+    stack.used -= (long int) fragment;
+    stack.top1 -= (long int) fragment;
+
+    Glu->ucol = ucol;
+    Glu->lsub = lsub;
+    Glu->usub = usub;
+    
+#ifdef DEBUG
+    printf("sStackCompress: fragment %d\n", fragment);
+    /* for (last = 0; last < ndim; ++last)
+	print_lu_col("After compress:", last, 0);*/
+#endif    
+    
+}
+
+/*
+ * Allocate storage for original matrix A
+ */
+void
+sallocateA(int n, int nnz, float **a, int **asub, int **xa)
+{
+    *a    = (float *) floatMalloc(nnz);
+    *asub = (int *) intMalloc(nnz);
+    *xa   = (int *) intMalloc(n+1);
+}
+
+
+float *floatMalloc(int n)
+{
+    float *buf;
+    buf = (float *) SUPERLU_MALLOC(n * sizeof(float)); 
+    if ( !buf ) {
+	ABORT("SUPERLU_MALLOC failed for buf in floatMalloc()\n");
+    }
+    return (buf);
+}
+
+float *floatCalloc(int n)
+{
+    float *buf;
+    register int i;
+    float zero = 0.0;
+    buf = (float *) SUPERLU_MALLOC(n * sizeof(float));
+    if ( !buf ) {
+	ABORT("SUPERLU_MALLOC failed for buf in floatCalloc()\n");
+    }
+    for (i = 0; i < n; ++i) buf[i] = zero;
+    return (buf);
+}
+
+
+int smemory_usage(const int nzlmax, const int nzumax, 
+		  const int nzlumax, const int n)
+{
+    register int iword, dword;
+
+    iword   = sizeof(int);
+    dword   = sizeof(float);
+    
+    return (10 * n * iword +
+	    nzlmax * iword + nzumax * (iword + dword) + nzlumax * dword);
+
+}
diff --git a/intern/opennl/superlu/smyblas2.c b/intern/opennl/superlu/smyblas2.c
new file mode 100644
index 00000000000..729e17f7674
--- /dev/null
+++ b/intern/opennl/superlu/smyblas2.c
@@ -0,0 +1,225 @@
+
+
+/*
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ */
+/*
+ * File name:		smyblas2.c
+ * Purpose:
+ *     Level 2 BLAS operations: solves and matvec, written in C.
+ * Note:
+ *     This is only used when the system lacks an efficient BLAS library.
+ */
+
+/*
+ * Solves a dense UNIT lower triangular system. The unit lower 
+ * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). 
+ * The solution will be returned in the rhs vector.
+ */
+void slsolve ( int ldm, int ncol, float *M, float *rhs )
+{
+    int k;
+    float x0, x1, x2, x3, x4, x5, x6, x7;
+    float *M0;
+    register float *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7;
+    register int firstcol = 0;
+
+    M0 = &M[0];
+
+    while ( firstcol < ncol - 7 ) { /* Do 8 columns */
+      Mki0 = M0 + 1;
+      Mki1 = Mki0 + ldm + 1;
+      Mki2 = Mki1 + ldm + 1;
+      Mki3 = Mki2 + ldm + 1;
+      Mki4 = Mki3 + ldm + 1;
+      Mki5 = Mki4 + ldm + 1;
+      Mki6 = Mki5 + ldm + 1;
+      Mki7 = Mki6 + ldm + 1;
+
+      x0 = rhs[firstcol];
+      x1 = rhs[firstcol+1] - x0 * *Mki0++;
+      x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++;
+      x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++;
+      x4 = rhs[firstcol+4] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
+	                   - x3 * *Mki3++;
+      x5 = rhs[firstcol+5] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
+	                   - x3 * *Mki3++ - x4 * *Mki4++;
+      x6 = rhs[firstcol+6] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
+	                   - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++;
+      x7 = rhs[firstcol+7] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++
+	                   - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++
+			   - x6 * *Mki6++;
+
+      rhs[++firstcol] = x1;
+      rhs[++firstcol] = x2;
+      rhs[++firstcol] = x3;
+      rhs[++firstcol] = x4;
+      rhs[++firstcol] = x5;
+      rhs[++firstcol] = x6;
+      rhs[++firstcol] = x7;
+      ++firstcol;
+    
+      for (k = firstcol; k < ncol; k++)
+	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++
+	                - x2 * *Mki2++ - x3 * *Mki3++
+                        - x4 * *Mki4++ - x5 * *Mki5++
+			- x6 * *Mki6++ - x7 * *Mki7++;
+ 
+      M0 += 8 * ldm + 8;
+    }
+
+    while ( firstcol < ncol - 3 ) { /* Do 4 columns */
+      Mki0 = M0 + 1;
+      Mki1 = Mki0 + ldm + 1;
+      Mki2 = Mki1 + ldm + 1;
+      Mki3 = Mki2 + ldm + 1;
+
+      x0 = rhs[firstcol];
+      x1 = rhs[firstcol+1] - x0 * *Mki0++;
+      x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++;
+      x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++;
+
+      rhs[++firstcol] = x1;
+      rhs[++firstcol] = x2;
+      rhs[++firstcol] = x3;
+      ++firstcol;
+    
+      for (k = firstcol; k < ncol; k++)
+	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++
+	                - x2 * *Mki2++ - x3 * *Mki3++;
+ 
+      M0 += 4 * ldm + 4;
+    }
+
+    if ( firstcol < ncol - 1 ) { /* Do 2 columns */
+      Mki0 = M0 + 1;
+      Mki1 = Mki0 + ldm + 1;
+
+      x0 = rhs[firstcol];
+      x1 = rhs[firstcol+1] - x0 * *Mki0++;
+
+      rhs[++firstcol] = x1;
+      ++firstcol;
+    
+      for (k = firstcol; k < ncol; k++)
+	rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++;
+ 
+    }
+    
+}
+
+/*
+ * Solves a dense upper triangular system. The upper triangular matrix is
+ * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned
+ * in the rhs vector.
+ */
+void
+susolve ( ldm, ncol, M, rhs )
+int ldm;	/* in */
+int ncol;	/* in */
+float *M;	/* in */
+float *rhs;	/* modified */
+{
+    float xj;
+    int jcol, j, irow;
+
+    jcol = ncol - 1;
+
+    for (j = 0; j < ncol; j++) {
+
+	xj = rhs[jcol] / M[jcol + jcol*ldm]; 		/* M(jcol, jcol) */
+	rhs[jcol] = xj;
+	
+	for (irow = 0; irow < jcol; irow++)
+	    rhs[irow] -= xj * M[irow + jcol*ldm];	/* M(irow, jcol) */
+
+	jcol--;
+
+    }
+}
+
+
+/*
+ * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec.
+ * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[].
+ */
+void smatvec ( ldm, nrow, ncol, M, vec, Mxvec )
+
+int ldm;	/* in -- leading dimension of M */
+int nrow;	/* in */ 
+int ncol;	/* in */
+float *M;	/* in */
+float *vec;	/* in */
+float *Mxvec;	/* in/out */
+
+{
+    float vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7;
+    float *M0;
+    register float *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7;
+    register int firstcol = 0;
+    int k;
+
+    M0 = &M[0];
+    while ( firstcol < ncol - 7 ) {	/* Do 8 columns */
+
+	Mki0 = M0;
+	Mki1 = Mki0 + ldm;
+        Mki2 = Mki1 + ldm;
+        Mki3 = Mki2 + ldm;
+	Mki4 = Mki3 + ldm;
+	Mki5 = Mki4 + ldm;
+	Mki6 = Mki5 + ldm;
+	Mki7 = Mki6 + ldm;
+
+	vi0 = vec[firstcol++];
+	vi1 = vec[firstcol++];
+	vi2 = vec[firstcol++];
+	vi3 = vec[firstcol++];	
+	vi4 = vec[firstcol++];
+	vi5 = vec[firstcol++];
+	vi6 = vec[firstcol++];
+	vi7 = vec[firstcol++];	
+
+	for (k = 0; k < nrow; k++) 
+	    Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++
+		      + vi2 * *Mki2++ + vi3 * *Mki3++ 
+		      + vi4 * *Mki4++ + vi5 * *Mki5++
+		      + vi6 * *Mki6++ + vi7 * *Mki7++;
+
+	M0 += 8 * ldm;
+    }
+
+    while ( firstcol < ncol - 3 ) {	/* Do 4 columns */
+
+	Mki0 = M0;
+	Mki1 = Mki0 + ldm;
+	Mki2 = Mki1 + ldm;
+	Mki3 = Mki2 + ldm;
+
+	vi0 = vec[firstcol++];
+	vi1 = vec[firstcol++];
+	vi2 = vec[firstcol++];
+	vi3 = vec[firstcol++];	
+	for (k = 0; k < nrow; k++) 
+	    Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++
+		      + vi2 * *Mki2++ + vi3 * *Mki3++ ;
+
+	M0 += 4 * ldm;
+    }
+
+    while ( firstcol < ncol ) {		/* Do 1 column */
+
+ 	Mki0 = M0;
+	vi0 = vec[firstcol++];
+	for (k = 0; k < nrow; k++)
+	    Mxvec[k] += vi0 * *Mki0++;
+
+	M0 += ldm;
+    }
+	
+}
+
diff --git a/intern/opennl/superlu/sp_coletree.c b/intern/opennl/superlu/sp_coletree.c
new file mode 100644
index 00000000000..d49919167f5
--- /dev/null
+++ b/intern/opennl/superlu/sp_coletree.c
@@ -0,0 +1,332 @@
+
+/*  Elimination tree computation and layout routines */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "ssp_defs.h"
+
+/* 
+ *  Implementation of disjoint set union routines.
+ *  Elements are integers in 0..n-1, and the 
+ *  names of the sets themselves are of type int.
+ *  
+ *  Calls are:
+ *  initialize_disjoint_sets (n) initial call.
+ *  s = make_set (i)             returns a set containing only i.
+ *  s = link (t, u)		 returns s = t union u, destroying t and u.
+ *  s = find (i)		 return name of set containing i.
+ *  finalize_disjoint_sets 	 final call.
+ *
+ *  This implementation uses path compression but not weighted union.
+ *  See Tarjan's book for details.
+ *  John Gilbert, CMI, 1987.
+ *
+ *  Implemented path-halving by XSL 07/05/95.
+ */
+
+static int	*pp;		/* parent array for sets */
+
+static 
+int *mxCallocInt(int n)
+{
+    register int i;
+    int *buf;
+
+    buf = (int *) SUPERLU_MALLOC( n * sizeof(int) );
+    if ( !buf ) {
+         ABORT("SUPERLU_MALLOC fails for buf in mxCallocInt()");
+       }
+    for (i = 0; i < n; i++) buf[i] = 0;
+    return (buf);
+}
+      
+static
+void initialize_disjoint_sets (
+	int n
+	)
+{
+	pp = mxCallocInt(n);
+}
+
+
+static
+int make_set (
+	int i
+	)
+{
+	pp[i] = i;
+	return i;
+}
+
+
+static
+int link (
+	int s,
+	int t
+	)
+{
+	pp[s] = t;
+	return t;
+}
+
+
+/* PATH HALVING */
+static
+int find (int i)
+{
+    register int p, gp;
+    
+    p = pp[i];
+    gp = pp[p];
+    while (gp != p) {
+	pp[i] = gp;
+	i = gp;
+	p = pp[i];
+	gp = pp[p];
+    }
+    return (p);
+}
+
+#if 0
+/* PATH COMPRESSION */
+static
+int find (
+	int i
+	)
+{
+	if (pp[i] != i) 
+		pp[i] = find (pp[i]);
+	return pp[i];
+}
+#endif
+
+static
+void finalize_disjoint_sets (
+	void
+	)
+{
+	SUPERLU_FREE(pp);
+}
+
+
+/*
+ *      Find the elimination tree for A'*A.
+ *      This uses something similar to Liu's algorithm. 
+ *      It runs in time O(nz(A)*log n) and does not form A'*A.
+ *
+ *      Input:
+ *        Sparse matrix A.  Numeric values are ignored, so any
+ *        explicit zeros are treated as nonzero.
+ *      Output:
+ *        Integer array of parents representing the elimination
+ *        tree of the symbolic product A'*A.  Each vertex is a
+ *        column of A, and nc means a root of the elimination forest.
+ *
+ *      John R. Gilbert, Xerox, 10 Dec 1990
+ *      Based on code by JRG dated 1987, 1988, and 1990.
+ */
+
+/*
+ * Nonsymmetric elimination tree
+ */
+int
+sp_coletree(
+	    int *acolst, int *acolend, /* column start and end past 1 */
+	    int *arow,                 /* row indices of A */
+	    int nr, int nc,            /* dimension of A */
+	    int *parent	               /* parent in elim tree */
+	    )
+{
+	int	*root;			/* root of subtee of etree 	*/
+	int     *firstcol;		/* first nonzero col in each row*/
+	int	rset, cset;             
+	int	row, col;
+	int	rroot;
+	int	p;
+
+	root = mxCallocInt (nc);
+	initialize_disjoint_sets (nc);
+
+	/* Compute firstcol[row] = first nonzero column in row */
+
+	firstcol = mxCallocInt (nr);
+	for (row = 0; row < nr; firstcol[row++] = nc);
+	for (col = 0; col < nc; col++) 
+		for (p = acolst[col]; p < acolend[col]; p++) {
+			row = arow[p];
+			firstcol[row] = SUPERLU_MIN(firstcol[row], col);
+		}
+
+	/* Compute etree by Liu's algorithm for symmetric matrices,
+           except use (firstcol[r],c) in place of an edge (r,c) of A.
+	   Thus each row clique in A'*A is replaced by a star
+	   centered at its first vertex, which has the same fill. */
+
+	for (col = 0; col < nc; col++) {
+		cset = make_set (col);
+		root[cset] = col;
+		parent[col] = nc; /* Matlab */
+		for (p = acolst[col]; p < acolend[col]; p++) {
+			row = firstcol[arow[p]];
+			if (row >= col) continue;
+			rset = find (row);
+			rroot = root[rset];
+			if (rroot != col) {
+				parent[rroot] = col;
+				cset = link (cset, rset);
+				root[cset] = col;
+			}
+		}
+	}
+
+	SUPERLU_FREE (root);
+	SUPERLU_FREE (firstcol);
+	finalize_disjoint_sets ();
+	return 0;
+}
+
+/*
+ *  q = TreePostorder (n, p);
+ *
+ *	Postorder a tree.
+ *	Input:
+ *	  p is a vector of parent pointers for a forest whose
+ *        vertices are the integers 0 to n-1; p[root]==n.
+ *	Output:
+ *	  q is a vector indexed by 0..n-1 such that q[i] is the
+ *	  i-th vertex in a postorder numbering of the tree.
+ *
+ *        ( 2/7/95 modified by X.Li:
+ *          q is a vector indexed by 0:n-1 such that vertex i is the
+ *          q[i]-th vertex in a postorder numbering of the tree.
+ *          That is, this is the inverse of the previous q. )
+ *
+ *	In the child structure, lower-numbered children are represented
+ *	first, so that a tree which is already numbered in postorder
+ *	will not have its order changed.
+ *    
+ *  Written by John Gilbert, Xerox, 10 Dec 1990.
+ *  Based on code written by John Gilbert at CMI in 1987.
+ */
+
+static int	*first_kid, *next_kid;	/* Linked list of children.	*/
+static int	*post, postnum;
+
+static
+/*
+ * Depth-first search from vertex v.
+ */
+void etdfs (
+	int	v
+	)
+{
+	int	w;
+
+	for (w = first_kid[v]; w != -1; w = next_kid[w]) {
+		etdfs (w);
+	}
+	/* post[postnum++] = v; in Matlab */
+	post[v] = postnum++;    /* Modified by X.Li on 2/14/95 */
+}
+
+
+/*
+ * Post order a tree
+ */
+int *TreePostorder(
+	int n,
+	int *parent
+)
+{
+	int	v, dad;
+
+	/* Allocate storage for working arrays and results	*/
+	first_kid = 	mxCallocInt (n+1);
+	next_kid  = 	mxCallocInt (n+1);
+	post	  = 	mxCallocInt (n+1);
+
+	/* Set up structure describing children */
+	for (v = 0; v <= n; first_kid[v++] = -1);
+	for (v = n-1; v >= 0; v--) {
+		dad = parent[v];
+		next_kid[v] = first_kid[dad];
+		first_kid[dad] = v;
+	}
+
+	/* Depth-first search from dummy root vertex #n */
+	postnum = 0;
+	etdfs (n);
+
+	SUPERLU_FREE (first_kid);
+	SUPERLU_FREE (next_kid);
+	return post;
+}
+
+
+/*
+ *      p = spsymetree (A);
+ *
+ *      Find the elimination tree for symmetric matrix A.
+ *      This uses Liu's algorithm, and runs in time O(nz*log n).
+ *
+ *      Input:
+ *        Square sparse matrix A.  No check is made for symmetry;
+ *        elements below and on the diagonal are ignored.
+ *        Numeric values are ignored, so any explicit zeros are 
+ *        treated as nonzero.
+ *      Output:
+ *        Integer array of parents representing the etree, with n
+ *        meaning a root of the elimination forest.
+ *      Note:  
+ *        This routine uses only the upper triangle, while sparse
+ *        Cholesky (as in spchol.c) uses only the lower.  Matlab's
+ *        dense Cholesky uses only the upper.  This routine could
+ *        be modified to use the lower triangle either by transposing
+ *        the matrix or by traversing it by rows with auxiliary
+ *        pointer and link arrays.
+ *
+ *      John R. Gilbert, Xerox, 10 Dec 1990
+ *      Based on code by JRG dated 1987, 1988, and 1990.
+ *      Modified by X.S. Li, November 1999.
+ */
+
+/*
+ * Symmetric elimination tree
+ */
+int
+sp_symetree(
+	    int *acolst, int *acolend, /* column starts and ends past 1 */
+	    int *arow,            /* row indices of A */
+	    int n,                /* dimension of A */
+	    int *parent	    /* parent in elim tree */
+	    )
+{
+	int	*root;		    /* root of subtree of etree 	*/
+	int	rset, cset;             
+	int	row, col;
+	int	rroot;
+	int	p;
+
+	root = mxCallocInt (n);
+	initialize_disjoint_sets (n);
+
+	for (col = 0; col < n; col++) {
+		cset = make_set (col);
+		root[cset] = col;
+		parent[col] = n; /* Matlab */
+		for (p = acolst[col]; p < acolend[col]; p++) {
+			row = arow[p];
+			if (row >= col) continue;
+			rset = find (row);
+			rroot = root[rset];
+			if (rroot != col) {
+				parent[rroot] = col;
+				cset = link (cset, rset);
+				root[cset] = col;
+			}
+		}
+	}
+	SUPERLU_FREE (root);
+	finalize_disjoint_sets ();
+	return 0;
+} /* SP_SYMETREE */
diff --git a/intern/opennl/superlu/sp_ienv.c b/intern/opennl/superlu/sp_ienv.c
new file mode 100644
index 00000000000..5b0ba7b2151
--- /dev/null
+++ b/intern/opennl/superlu/sp_ienv.c
@@ -0,0 +1,65 @@
+/*
+ * File name:		sp_ienv.c
+ * History:             Modified from lapack routine ILAENV
+ */
+
+#include "ssp_defs.h"
+#include "util.h"
+
+int
+sp_ienv(int ispec)
+{
+/*
+    Purpose   
+    =======   
+
+    sp_ienv() is inquired to choose machine-dependent parameters for the
+    local environment. See ISPEC for a description of the parameters.   
+
+    This version provides a set of parameters which should give good,   
+    but not optimal, performance on many of the currently available   
+    computers.  Users are encouraged to modify this subroutine to set   
+    the tuning parameters for their particular machine using the option   
+    and problem size information in the arguments.   
+
+    Arguments   
+    =========   
+
+    ISPEC   (input) int
+            Specifies the parameter to be returned as the value of SP_IENV.   
+            = 1: the panel size w; a panel consists of w consecutive
+	         columns of matrix A in the process of Gaussian elimination.
+		 The best value depends on machine's cache characters.
+            = 2: the relaxation parameter relax; if the number of
+	         nodes (columns) in a subtree of the elimination tree is less
+		 than relax, this subtree is considered as one supernode,
+		 regardless of their row structures.
+            = 3: the maximum size for a supernode;
+	    = 4: the minimum row dimension for 2-D blocking to be used;
+	    = 5: the minimum column dimension for 2-D blocking to be used;
+	    = 6: the estimated fills factor for L and U, compared with A;
+	    
+   (SP_IENV) (output) int
+            >= 0: the value of the parameter specified by ISPEC   
+            < 0:  if SP_IENV = -k, the k-th argument had an illegal value. 
+  
+    ===================================================================== 
+*/
+    int i;
+
+    switch (ispec) {
+	case 1: return (10);
+	case 2: return (5);
+	case 3: return (100);
+	case 4: return (200);
+	case 5: return (40);
+        case 6: return (20);
+    }
+
+    /* Invalid value for ISPEC */
+    i = 1;
+    xerbla_("sp_ienv", &i);
+    return 0;
+
+} /* sp_ienv_ */
+
diff --git a/intern/opennl/superlu/sp_preorder.c b/intern/opennl/superlu/sp_preorder.c
new file mode 100644
index 00000000000..f82da2de1aa
--- /dev/null
+++ b/intern/opennl/superlu/sp_preorder.c
@@ -0,0 +1,203 @@
+#include "ssp_defs.h"
+
+void
+sp_preorder(superlu_options_t *options,  SuperMatrix *A, int *perm_c, 
+	    int *etree, SuperMatrix *AC)
+{
+/*
+ * Purpose
+ * =======
+ *
+ * sp_preorder() permutes the columns of the original matrix. It performs
+ * the following steps:
+ *
+ *    1. Apply column permutation perm_c[] to A's column pointers to form AC;
+ *
+ *    2. If options->Fact = DOFACT, then
+ *       (1) Compute column elimination tree etree[] of AC'AC;
+ *       (2) Post order etree[] to get a postordered elimination tree etree[],
+ *           and a postorder permutation post[];
+ *       (3) Apply post[] permutation to columns of AC;
+ *       (4) Overwrite perm_c[] with the product perm_c * post.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_options_t*
+ *         Specifies whether or not the elimination tree will be re-used.
+ *         If options->Fact == DOFACT, this means first time factor A, 
+ *         etree is computed, postered, and output.
+ *         Otherwise, re-factor A, etree is input, unchanged on exit.
+ *
+ * A       (input) SuperMatrix*
+ *         Matrix A in A*X=B, of dimension (A->nrow, A->ncol). The number
+ *         of the linear equations is A->nrow. Currently, the type of A can be:
+ *         Stype = NC or SLU_NCP; Mtype = SLU_GE.
+ *         In the future, more general A may be handled.
+ *
+ * perm_c  (input/output) int*
+ *	   Column permutation vector of size A->ncol, which defines the 
+ *         permutation matrix Pc; perm_c[i] = j means column i of A is 
+ *         in position j in A*Pc.
+ *         If options->Fact == DOFACT, perm_c is both input and output.
+ *         On output, it is changed according to a postorder of etree.
+ *         Otherwise, perm_c is input.
+ *
+ * etree   (input/output) int*
+ *         Elimination tree of Pc'*A'*A*Pc, dimension A->ncol.
+ *         If options->Fact == DOFACT, etree is an output argument,
+ *         otherwise it is an input argument.
+ *         Note: etree is a vector of parent pointers for a forest whose
+ *         vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ * AC      (output) SuperMatrix*
+ *         The resulting matrix after applied the column permutation
+ *         perm_c[] to matrix A. The type of AC can be:
+ *         Stype = SLU_NCP; Dtype = A->Dtype; Mtype = SLU_GE.
+ *
+ */
+
+    NCformat  *Astore;
+    NCPformat *ACstore;
+    int       *iwork, *post;
+    register  int n, i;
+
+    n = A->ncol;
+    
+    /* Apply column permutation perm_c to A's column pointers so to
+       obtain NCP format in AC = A*Pc.  */
+    AC->Stype       = SLU_NCP;
+    AC->Dtype       = A->Dtype;
+    AC->Mtype       = A->Mtype;
+    AC->nrow        = A->nrow;
+    AC->ncol        = A->ncol;
+    Astore          = A->Store;
+    ACstore = AC->Store = (void *) SUPERLU_MALLOC( sizeof(NCPformat) );
+    if ( !ACstore ) ABORT("SUPERLU_MALLOC fails for ACstore");
+    ACstore->nnz    = Astore->nnz;
+    ACstore->nzval  = Astore->nzval;
+    ACstore->rowind = Astore->rowind;
+    ACstore->colbeg = (int*) SUPERLU_MALLOC(n*sizeof(int));
+    if ( !(ACstore->colbeg) ) ABORT("SUPERLU_MALLOC fails for ACstore->colbeg");
+    ACstore->colend = (int*) SUPERLU_MALLOC(n*sizeof(int));
+    if ( !(ACstore->colend) ) ABORT("SUPERLU_MALLOC fails for ACstore->colend");
+
+#ifdef DEBUG
+    print_int_vec("pre_order:", n, perm_c);
+    check_perm("Initial perm_c", n, perm_c);
+#endif      
+
+    for (i = 0; i < n; i++) {
+	ACstore->colbeg[perm_c[i]] = Astore->colptr[i]; 
+	ACstore->colend[perm_c[i]] = Astore->colptr[i+1];
+    }
+	
+    if ( options->Fact == DOFACT ) {
+#undef ETREE_ATplusA
+#ifdef ETREE_ATplusA
+        /*--------------------------------------------
+	  COMPUTE THE ETREE OF Pc*(A'+A)*Pc'.
+	  --------------------------------------------*/
+        int *b_colptr, *b_rowind, bnz, j;
+	int *c_colbeg, *c_colend;
+
+        /*printf("Use etree(A'+A)\n");*/
+
+	/* Form B = A + A'. */
+	at_plus_a(n, Astore->nnz, Astore->colptr, Astore->rowind,
+		  &bnz, &b_colptr, &b_rowind);
+
+	/* Form C = Pc*B*Pc'. */
+	c_colbeg = (int*) SUPERLU_MALLOC(2*n*sizeof(int));
+	c_colend = c_colbeg + n;
+	if (!c_colbeg ) ABORT("SUPERLU_MALLOC fails for c_colbeg/c_colend");
+	for (i = 0; i < n; i++) {
+	    c_colbeg[perm_c[i]] = b_colptr[i]; 
+  	    c_colend[perm_c[i]] = b_colptr[i+1];
+	}
+	for (j = 0; j < n; ++j) {
+	    for (i = c_colbeg[j]; i < c_colend[j]; ++i) {
+	        b_rowind[i] = perm_c[b_rowind[i]];
+	    }
+	}
+
+	/* Compute etree of C. */
+	sp_symetree(c_colbeg, c_colend, b_rowind, n, etree);
+
+	SUPERLU_FREE(b_colptr);
+	if ( bnz ) SUPERLU_FREE(b_rowind);
+	SUPERLU_FREE(c_colbeg);
+	
+#else
+        /*--------------------------------------------
+	  COMPUTE THE COLUMN ELIMINATION TREE.
+	  --------------------------------------------*/
+	sp_coletree(ACstore->colbeg, ACstore->colend, ACstore->rowind,
+		    A->nrow, A->ncol, etree);
+#endif
+#ifdef DEBUG	
+	print_int_vec("etree:", n, etree);
+#endif	
+	
+	/* In symmetric mode, do not do postorder here. */
+	if ( options->SymmetricMode == NO ) {
+	    /* Post order etree */
+	    post = (int *) TreePostorder(n, etree);
+	    /* for (i = 0; i < n+1; ++i) inv_post[post[i]] = i;
+	       iwork = post; */
+
+#ifdef DEBUG
+	    print_int_vec("post:", n+1, post);
+	    check_perm("post", n, post);	
+#endif	
+	    iwork = (int*) SUPERLU_MALLOC((n+1)*sizeof(int)); 
+	    if ( !iwork ) ABORT("SUPERLU_MALLOC fails for iwork[]");
+
+	    /* Renumber etree in postorder */
+	    for (i = 0; i < n; ++i) iwork[post[i]] = post[etree[i]];
+	    for (i = 0; i < n; ++i) etree[i] = iwork[i];
+
+#ifdef DEBUG	
+	    print_int_vec("postorder etree:", n, etree);
+#endif
+	
+	    /* Postmultiply A*Pc by post[] */
+	    for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colbeg[i];
+	    for (i = 0; i < n; ++i) ACstore->colbeg[i] = iwork[i];
+	    for (i = 0; i < n; ++i) iwork[post[i]] = ACstore->colend[i];
+	    for (i = 0; i < n; ++i) ACstore->colend[i] = iwork[i];
+
+	    for (i = 0; i < n; ++i)
+	        iwork[i] = post[perm_c[i]];  /* product of perm_c and post */
+	    for (i = 0; i < n; ++i) perm_c[i] = iwork[i];
+
+#ifdef DEBUG
+	    print_int_vec("Pc*post:", n, perm_c);
+	    check_perm("final perm_c", n, perm_c);	
+#endif
+	    SUPERLU_FREE (post);
+	    SUPERLU_FREE (iwork);
+	} /* end postordering */
+
+    } /* if options->Fact == DOFACT ... */
+
+}
+
+int check_perm(char *what, int n, int *perm)
+{
+    register int i;
+    int          *marker;
+    marker = (int *) calloc(n, sizeof(int));
+
+    for (i = 0; i < n; ++i) {
+	if ( marker[perm[i]] == 1 || perm[i] >= n ) {
+	    printf("%s: Not a valid PERM[%d] = %d\n", what, i, perm[i]);
+	    ABORT("check_perm");
+	} else {
+	    marker[perm[i]] = 1;
+	}
+    }
+
+    SUPERLU_FREE(marker);
+    return 0;
+}
diff --git a/intern/opennl/superlu/spanel_bmod.c b/intern/opennl/superlu/spanel_bmod.c
new file mode 100644
index 00000000000..a59a9086df1
--- /dev/null
+++ b/intern/opennl/superlu/spanel_bmod.c
@@ -0,0 +1,449 @@
+
+/*
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ */
+/*
+  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ 
+  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ 
+  Permission is hereby granted to use or copy this program for any
+  purpose, provided the above notices are retained on all copies.
+  Permission to modify the code and to distribute modified code is
+  granted, provided the above notices are retained, and a notice that
+  the code was modified is included with the above copyright notice.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "ssp_defs.h"
+
+/* 
+ * Function prototypes 
+ */
+void slsolve(int, int, float *, float *);
+void smatvec(int, int, int, float *, float *, float *);
+extern void scheck_tempv();
+
+void
+spanel_bmod (
+	    const int  m,          /* in - number of rows in the matrix */
+	    const int  w,          /* in */
+	    const int  jcol,       /* in */
+	    const int  nseg,       /* in */
+	    float     *dense,     /* out, of size n by w */
+	    float     *tempv,     /* working array */
+	    int        *segrep,    /* in */
+	    int        *repfnz,    /* in, of size n by w */
+	    GlobalLU_t *Glu,       /* modified */
+	    SuperLUStat_t *stat    /* output */
+	    )
+{
+/* 
+ * Purpose
+ * =======
+ *
+ *    Performs numeric block updates (sup-panel) in topological order.
+ *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
+ *    Special processing on the supernodal portion of L\U[*,j]
+ *
+ *    Before entering this routine, the original nonzeros in the panel 
+ *    were already copied into the spa[m,w].
+ *
+ *    Updated/Output parameters-
+ *	dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned 
+ *      collectively in the m-by-w vector dense[*]. 
+ *
+ */
+
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+    _fcd ftcs1 = _cptofcd("L", strlen("L")),
+         ftcs2 = _cptofcd("N", strlen("N")),
+         ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+    int          incx = 1, incy = 1;
+    float       alpha, beta;
+#endif
+
+    register int k, ksub;
+    int          fsupc, nsupc, nsupr, nrow;
+    int          krep, krep_ind;
+    float       ukj, ukj1, ukj2;
+    int          luptr, luptr1, luptr2;
+    int          segsze;
+    int          block_nrow;  /* no of rows in a block row */
+    register int lptr;	      /* Points to the row subscripts of a supernode */
+    int          kfnz, irow, no_zeros; 
+    register int isub, isub1, i;
+    register int jj;	      /* Index through each column in the panel */
+    int          *xsup, *supno;
+    int          *lsub, *xlsub;
+    float       *lusup;
+    int          *xlusup;
+    int          *repfnz_col; /* repfnz[] for a column in the panel */
+    float       *dense_col;  /* dense[] for a column in the panel */
+    float       *tempv1;             /* Used in 1-D update */
+    float       *TriTmp, *MatvecTmp; /* used in 2-D update */
+    float      zero = 0.0;
+    register int ldaTmp;
+    register int r_ind, r_hi;
+    static   int first = 1, maxsuper, rowblk, colblk;
+    flops_t  *ops = stat->ops;
+    
+    xsup    = Glu->xsup;
+    supno   = Glu->supno;
+    lsub    = Glu->lsub;
+    xlsub   = Glu->xlsub;
+    lusup   = Glu->lusup;
+    xlusup  = Glu->xlusup;
+    
+    if ( first ) {
+	maxsuper = sp_ienv(3);
+	rowblk   = sp_ienv(4);
+	colblk   = sp_ienv(5);
+	first = 0;
+    }
+    ldaTmp = maxsuper + rowblk;
+
+    /* 
+     * For each nonz supernode segment of U[*,j] in topological order 
+     */
+    k = nseg - 1;
+    for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */
+
+	/* krep = representative of current k-th supernode
+	 * fsupc = first supernodal column
+	 * nsupc = no of columns in a supernode
+	 * nsupr = no of rows in a supernode
+	 */
+        krep = segrep[k--];
+	fsupc = xsup[supno[krep]];
+	nsupc = krep - fsupc + 1;
+	nsupr = xlsub[fsupc+1] - xlsub[fsupc];
+	nrow = nsupr - nsupc;
+	lptr = xlsub[fsupc];
+	krep_ind = lptr + nsupc - 1;
+
+	repfnz_col = repfnz;
+	dense_col = dense;
+	
+	if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */
+
+	    TriTmp = tempv;
+	
+	    /* Sequence through each column in panel -- triangular solves */
+	    for (jj = jcol; jj < jcol + w; jj++,
+		 repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {
+
+		kfnz = repfnz_col[krep];
+		if ( kfnz == EMPTY ) continue;	/* Skip any zero segment */
+	    
+		segsze = krep - kfnz + 1;
+		luptr = xlusup[fsupc];
+
+		ops[TRSV] += segsze * (segsze - 1);
+		ops[GEMV] += 2 * nrow * segsze;
+	
+		/* Case 1: Update U-segment of size 1 -- col-col update */
+		if ( segsze == 1 ) {
+		    ukj = dense_col[lsub[krep_ind]];
+		    luptr += nsupr*(nsupc-1) + nsupc;
+
+		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
+			irow = lsub[i];
+			dense_col[irow] -= ukj * lusup[luptr];
+			++luptr;
+		    }
+
+		} else if ( segsze <= 3 ) {
+		    ukj = dense_col[lsub[krep_ind]];
+		    ukj1 = dense_col[lsub[krep_ind - 1]];
+		    luptr += nsupr*(nsupc-1) + nsupc-1;
+		    luptr1 = luptr - nsupr;
+
+		    if ( segsze == 2 ) {
+			ukj -= ukj1 * lusup[luptr1];
+			dense_col[lsub[krep_ind]] = ukj;
+			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
+			    irow = lsub[i];
+			    luptr++; luptr1++;
+			    dense_col[irow] -= (ukj*lusup[luptr]
+						+ ukj1*lusup[luptr1]);
+			}
+		    } else {
+			ukj2 = dense_col[lsub[krep_ind - 2]];
+			luptr2 = luptr1 - nsupr;
+			ukj1 -= ukj2 * lusup[luptr2-1];
+			ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
+			dense_col[lsub[krep_ind]] = ukj;
+			dense_col[lsub[krep_ind-1]] = ukj1;
+			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
+			    irow = lsub[i];
+			    luptr++; luptr1++; luptr2++;
+			    dense_col[irow] -= ( ukj*lusup[luptr]
+                             + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
+			}
+		    }
+
+		} else  {	/* segsze >= 4 */
+		    
+		    /* Copy U[*,j] segment from dense[*] to TriTmp[*], which
+		       holds the result of triangular solves.    */
+		    no_zeros = kfnz - fsupc;
+		    isub = lptr + no_zeros;
+		    for (i = 0; i < segsze; ++i) {
+			irow = lsub[isub];
+			TriTmp[i] = dense_col[irow]; /* Gather */
+			++isub;
+		    }
+		    
+		    /* start effective triangle */
+		    luptr += nsupr * no_zeros + no_zeros;
+
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+		    STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
+			   &nsupr, TriTmp, &incx );
+#else
+		    strsv_( "L", "N", "U", &segsze, &lusup[luptr], 
+			   &nsupr, TriTmp, &incx );
+#endif
+#else		
+		    slsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
+#endif
+		    
+
+		} /* else ... */
+	    
+	    }  /* for jj ... end tri-solves */
+
+	    /* Block row updates; push all the way into dense[*] block */
+	    for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {
+		
+		r_hi = SUPERLU_MIN(nrow, r_ind + rowblk);
+		block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind);
+		luptr = xlusup[fsupc] + nsupc + r_ind;
+		isub1 = lptr + nsupc + r_ind;
+		
+		repfnz_col = repfnz;
+		TriTmp = tempv;
+		dense_col = dense;
+		
+		/* Sequence through each column in panel -- matrix-vector */
+		for (jj = jcol; jj < jcol + w; jj++,
+		     repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
+		    
+		    kfnz = repfnz_col[krep];
+		    if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
+		    
+		    segsze = krep - kfnz + 1;
+		    if ( segsze <= 3 ) continue;   /* skip unrolled cases */
+		    
+		    /* Perform a block update, and scatter the result of
+		       matrix-vector to dense[].		 */
+		    no_zeros = kfnz - fsupc;
+		    luptr1 = luptr + nsupr * no_zeros;
+		    MatvecTmp = &TriTmp[maxsuper];
+		    
+#ifdef USE_VENDOR_BLAS
+		    alpha = one; 
+                    beta = zero;
+#ifdef _CRAY
+		    SGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], 
+			   &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
+#else
+		    sgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], 
+			   &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
+#endif
+#else
+		    smatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
+			   TriTmp, MatvecTmp);
+#endif
+		    
+		    /* Scatter MatvecTmp[*] into SPA dense[*] temporarily
+		     * such that MatvecTmp[*] can be re-used for the
+		     * the next blok row update. dense[] will be copied into 
+		     * global store after the whole panel has been finished.
+		     */
+		    isub = isub1;
+		    for (i = 0; i < block_nrow; i++) {
+			irow = lsub[isub];
+			dense_col[irow] -= MatvecTmp[i];
+			MatvecTmp[i] = zero;
+			++isub;
+		    }
+		    
+		} /* for jj ... */
+		
+	    } /* for each block row ... */
+	    
+	    /* Scatter the triangular solves into SPA dense[*] */
+	    repfnz_col = repfnz;
+	    TriTmp = tempv;
+	    dense_col = dense;
+	    
+	    for (jj = jcol; jj < jcol + w; jj++,
+		 repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
+		kfnz = repfnz_col[krep];
+		if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
+		
+		segsze = krep - kfnz + 1;
+		if ( segsze <= 3 ) continue; /* skip unrolled cases */
+		
+		no_zeros = kfnz - fsupc;		
+		isub = lptr + no_zeros;
+		for (i = 0; i < segsze; i++) {
+		    irow = lsub[isub];
+		    dense_col[irow] = TriTmp[i];
+		    TriTmp[i] = zero;
+		    ++isub;
+		}
+		
+	    } /* for jj ... */
+	    
+	} else { /* 1-D block modification */
+	    
+	    
+	    /* Sequence through each column in the panel */
+	    for (jj = jcol; jj < jcol + w; jj++,
+		 repfnz_col += m, dense_col += m) {
+		
+		kfnz = repfnz_col[krep];
+		if ( kfnz == EMPTY ) continue;	/* Skip any zero segment */
+		
+		segsze = krep - kfnz + 1;
+		luptr = xlusup[fsupc];
+
+		ops[TRSV] += segsze * (segsze - 1);
+		ops[GEMV] += 2 * nrow * segsze;
+		
+		/* Case 1: Update U-segment of size 1 -- col-col update */
+		if ( segsze == 1 ) {
+		    ukj = dense_col[lsub[krep_ind]];
+		    luptr += nsupr*(nsupc-1) + nsupc;
+
+		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
+			irow = lsub[i];
+			dense_col[irow] -= ukj * lusup[luptr];
+			++luptr;
+		    }
+
+		} else if ( segsze <= 3 ) {
+		    ukj = dense_col[lsub[krep_ind]];
+		    luptr += nsupr*(nsupc-1) + nsupc-1;
+		    ukj1 = dense_col[lsub[krep_ind - 1]];
+		    luptr1 = luptr - nsupr;
+
+		    if ( segsze == 2 ) {
+			ukj -= ukj1 * lusup[luptr1];
+			dense_col[lsub[krep_ind]] = ukj;
+			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
+			    irow = lsub[i];
+			    ++luptr;  ++luptr1;
+			    dense_col[irow] -= (ukj*lusup[luptr]
+						+ ukj1*lusup[luptr1]);
+			}
+		    } else {
+			ukj2 = dense_col[lsub[krep_ind - 2]];
+			luptr2 = luptr1 - nsupr;
+			ukj1 -= ukj2 * lusup[luptr2-1];
+			ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
+			dense_col[lsub[krep_ind]] = ukj;
+			dense_col[lsub[krep_ind-1]] = ukj1;
+			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
+			    irow = lsub[i];
+			    ++luptr; ++luptr1; ++luptr2;
+			    dense_col[irow] -= ( ukj*lusup[luptr]
+                             + ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
+			}
+		    }
+
+		} else  { /* segsze >= 4 */
+		    /* 
+		     * Perform a triangular solve and block update,
+		     * then scatter the result of sup-col update to dense[].
+		     */
+		    no_zeros = kfnz - fsupc;
+		    
+		    /* Copy U[*,j] segment from dense[*] to tempv[*]: 
+		     *    The result of triangular solve is in tempv[*];
+		     *    The result of matrix vector update is in dense_col[*]
+		     */
+		    isub = lptr + no_zeros;
+		    for (i = 0; i < segsze; ++i) {
+			irow = lsub[isub];
+			tempv[i] = dense_col[irow]; /* Gather */
+			++isub;
+		    }
+		    
+		    /* start effective triangle */
+		    luptr += nsupr * no_zeros + no_zeros;
+		    
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+		    STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
+			   &nsupr, tempv, &incx );
+#else
+		    strsv_( "L", "N", "U", &segsze, &lusup[luptr], 
+			   &nsupr, tempv, &incx );
+#endif
+		    
+		    luptr += segsze;	/* Dense matrix-vector */
+		    tempv1 = &tempv[segsze];
+                    alpha = one;
+                    beta = zero;
+#ifdef _CRAY
+		    SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
+			   &nsupr, tempv, &incx, &beta, tempv1, &incy );
+#else
+		    sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
+			   &nsupr, tempv, &incx, &beta, tempv1, &incy );
+#endif
+#else
+		    slsolve ( nsupr, segsze, &lusup[luptr], tempv );
+		    
+		    luptr += segsze;        /* Dense matrix-vector */
+		    tempv1 = &tempv[segsze];
+		    smatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
+#endif
+		    
+		    /* Scatter tempv[*] into SPA dense[*] temporarily, such
+		     * that tempv[*] can be used for the triangular solve of
+		     * the next column of the panel. They will be copied into 
+		     * ucol[*] after the whole panel has been finished.
+		     */
+		    isub = lptr + no_zeros;
+		    for (i = 0; i < segsze; i++) {
+			irow = lsub[isub];
+			dense_col[irow] = tempv[i];
+			tempv[i] = zero;
+			isub++;
+		    }
+		    
+		    /* Scatter the update from tempv1[*] into SPA dense[*] */
+		    /* Start dense rectangular L */
+		    for (i = 0; i < nrow; i++) {
+			irow = lsub[isub];
+			dense_col[irow] -= tempv1[i];
+			tempv1[i] = zero;
+			++isub;	
+		    }
+		    
+		} /* else segsze>=4 ... */
+		
+	    } /* for each column in the panel... */
+	    
+	} /* else 1-D update ... */
+
+    } /* for each updating supernode ... */
+
+}
+
+
+
diff --git a/intern/opennl/superlu/spanel_dfs.c b/intern/opennl/superlu/spanel_dfs.c
new file mode 100644
index 00000000000..7f5f3c7532a
--- /dev/null
+++ b/intern/opennl/superlu/spanel_dfs.c
@@ -0,0 +1,249 @@
+
+
+/*
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ */
+/*
+  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ 
+  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ 
+  Permission is hereby granted to use or copy this program for any
+  purpose, provided the above notices are retained on all copies.
+  Permission to modify the code and to distribute modified code is
+  granted, provided the above notices are retained, and a notice that
+  the code was modified is included with the above copyright notice.
+*/
+
+#include "ssp_defs.h"
+#include "util.h"
+
+void
+spanel_dfs (
+	   const int  m,           /* in - number of rows in the matrix */
+	   const int  w,           /* in */
+	   const int  jcol,        /* in */
+	   SuperMatrix *A,       /* in - original matrix */
+	   int        *perm_r,     /* in */
+	   int        *nseg,	   /* out */
+	   float     *dense,      /* out */
+	   int        *panel_lsub, /* out */
+	   int        *segrep,     /* out */
+	   int        *repfnz,     /* out */
+	   int        *xprune,     /* out */
+	   int        *marker,     /* out */     
+	   int        *parent,     /* working array */
+	   int        *xplore,     /* working array */
+	   GlobalLU_t *Glu         /* modified */
+	   )
+{
+/*
+ * Purpose
+ * =======
+ *
+ *   Performs a symbolic factorization on a panel of columns [jcol, jcol+w).
+ *
+ *   A supernode representative is the last column of a supernode.
+ *   The nonzeros in U[*,j] are segments that end at supernodal
+ *   representatives.
+ *
+ *   The routine returns one list of the supernodal representatives
+ *   in topological order of the dfs that generates them. This list is
+ *   a superset of the topological order of each individual column within
+ *   the panel. 
+ *   The location of the first nonzero in each supernodal segment
+ *   (supernodal entry location) is also returned. Each column has a 
+ *   separate list for this purpose.
+ *
+ *   Two marker arrays are used for dfs:
+ *     marker[i] == jj, if i was visited during dfs of current column jj;
+ *     marker1[i] >= jcol, if i was visited by earlier columns in this panel;
+ *
+ *   marker: A-row --> A-row/col (0/1)
+ *   repfnz: SuperA-col --> PA-row
+ *   parent: SuperA-col --> SuperA-col
+ *   xplore: SuperA-col --> index to L-structure
+ *
+ */
+    NCPformat *Astore;
+    float    *a;
+    int       *asub;
+    int       *xa_begin, *xa_end;
+    int	      krep, chperm, chmark, chrep, oldrep, kchild, myfnz;
+    int       k, krow, kmark, kperm;
+    int       xdfs, maxdfs, kpar;
+    int       jj;	   /* index through each column in the panel */
+    int       *marker1;	   /* marker1[jj] >= jcol if vertex jj was visited 
+			      by a previous column within this panel.   */
+    int       *repfnz_col; /* start of each column in the panel */
+    float    *dense_col;  /* start of each column in the panel */
+    int       nextl_col;   /* next available position in panel_lsub[*,jj] */
+    int       *xsup, *supno;
+    int       *lsub, *xlsub;
+
+    /* Initialize pointers */
+    Astore     = A->Store;
+    a          = Astore->nzval;
+    asub       = Astore->rowind;
+    xa_begin   = Astore->colbeg;
+    xa_end     = Astore->colend;
+    marker1    = marker + m;
+    repfnz_col = repfnz;
+    dense_col  = dense;
+    *nseg      = 0;
+    xsup       = Glu->xsup;
+    supno      = Glu->supno;
+    lsub       = Glu->lsub;
+    xlsub      = Glu->xlsub;
+
+    /* For each column in the panel */
+    for (jj = jcol; jj < jcol + w; jj++) {
+	nextl_col = (jj - jcol) * m;
+
+#ifdef CHK_DFS
+	printf("\npanel col %d: ", jj);
+#endif
+
+	/* For each nonz in A[*,jj] do dfs */
+	for (k = xa_begin[jj]; k < xa_end[jj]; k++) {
+	    krow = asub[k];
+            dense_col[krow] = a[k];
+	    kmark = marker[krow];    	
+	    if ( kmark == jj ) 
+		continue;     /* krow visited before, go to the next nonzero */
+
+	    /* For each unmarked nbr krow of jj
+	     * krow is in L: place it in structure of L[*,jj]
+	     */
+	    marker[krow] = jj;
+	    kperm = perm_r[krow];
+	    
+	    if ( kperm == EMPTY ) {
+		panel_lsub[nextl_col++] = krow; /* krow is indexed into A */
+	    }
+	    /* 
+	     * krow is in U: if its supernode-rep krep
+	     * has been explored, update repfnz[*]
+	     */
+	    else {
+		
+		krep = xsup[supno[kperm]+1] - 1;
+		myfnz = repfnz_col[krep];
+		
+#ifdef CHK_DFS
+		printf("krep %d, myfnz %d, perm_r[%d] %d\n", krep, myfnz, krow, kperm);
+#endif
+		if ( myfnz != EMPTY ) {	/* Representative visited before */
+		    if ( myfnz > kperm ) repfnz_col[krep] = kperm;
+		    /* continue; */
+		}
+		else {
+		    /* Otherwise, perform dfs starting at krep */
+		    oldrep = EMPTY;
+		    parent[krep] = oldrep;
+		    repfnz_col[krep] = kperm;
+		    xdfs = xlsub[krep];
+		    maxdfs = xprune[krep];
+		    
+#ifdef CHK_DFS 
+		    printf("  xdfs %d, maxdfs %d: ", xdfs, maxdfs);
+		    for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]);
+		    printf("\n");
+#endif
+		    do {
+			/* 
+			 * For each unmarked kchild of krep 
+			 */
+			while ( xdfs < maxdfs ) {
+			    
+			    kchild = lsub[xdfs];
+			    xdfs++;
+			    chmark = marker[kchild];
+			    
+			    if ( chmark != jj ) { /* Not reached yet */
+				marker[kchild] = jj;
+				chperm = perm_r[kchild];
+			      
+				/* Case kchild is in L: place it in L[*,j] */
+				if ( chperm == EMPTY ) {
+				    panel_lsub[nextl_col++] = kchild;
+				} 
+				/* Case kchild is in U: 
+				 *   chrep = its supernode-rep. If its rep has 
+				 *   been explored, update its repfnz[*]
+				 */
+				else {
+				    
+				    chrep = xsup[supno[chperm]+1] - 1;
+				    myfnz = repfnz_col[chrep];
+#ifdef CHK_DFS
+				    printf("chrep %d,myfnz %d,perm_r[%d] %d\n",chrep,myfnz,kchild,chperm);
+#endif
+				    if ( myfnz != EMPTY ) { /* Visited before */
+					if ( myfnz > chperm )
+					    repfnz_col[chrep] = chperm;
+				    }
+				    else {
+					/* Cont. dfs at snode-rep of kchild */
+					xplore[krep] = xdfs;	
+					oldrep = krep;
+					krep = chrep; /* Go deeper down G(L) */
+					parent[krep] = oldrep;
+					repfnz_col[krep] = chperm;
+					xdfs = xlsub[krep];     
+					maxdfs = xprune[krep];
+#ifdef CHK_DFS 
+					printf("  xdfs %d, maxdfs %d: ", xdfs, maxdfs);
+					for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]);	
+					printf("\n");
+#endif
+				    } /* else */
+				  
+				} /* else */
+			      
+			    } /* if... */
+			    
+			} /* while xdfs < maxdfs */
+			
+			/* krow has no more unexplored nbrs:
+			 *    Place snode-rep krep in postorder DFS, if this 
+			 *    segment is seen for the first time. (Note that
+			 *    "repfnz[krep]" may change later.)
+			 *    Backtrack dfs to its parent.
+			 */
+			if ( marker1[krep] < jcol ) {
+			    segrep[*nseg] = krep;
+			    ++(*nseg);
+			    marker1[krep] = jj;
+			}
+			
+			kpar = parent[krep]; /* Pop stack, mimic recursion */
+			if ( kpar == EMPTY ) break; /* dfs done */
+			krep = kpar;
+			xdfs = xplore[krep];
+			maxdfs = xprune[krep];
+			
+#ifdef CHK_DFS 
+			printf("  pop stack: krep %d,xdfs %d,maxdfs %d: ", krep,xdfs,maxdfs);
+			for (i = xdfs; i < maxdfs; i++) printf(" %d", lsub[i]);
+			printf("\n");
+#endif
+		    } while ( kpar != EMPTY ); /* do-while - until empty stack */
+		    
+		} /* else */
+		
+	    } /* else */
+	    
+	} /* for each nonz in A[*,jj] */
+	
+	repfnz_col += m;    /* Move to next column */
+        dense_col += m;
+	
+    } /* for jj ... */
+    
+}
diff --git a/intern/opennl/superlu/spivotL.c b/intern/opennl/superlu/spivotL.c
new file mode 100644
index 00000000000..6243065bb5b
--- /dev/null
+++ b/intern/opennl/superlu/spivotL.c
@@ -0,0 +1,173 @@
+
+/*
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ */
+/*
+  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ 
+  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ 
+  Permission is hereby granted to use or copy this program for any
+  purpose, provided the above notices are retained on all copies.
+  Permission to modify the code and to distribute modified code is
+  granted, provided the above notices are retained, and a notice that
+  the code was modified is included with the above copyright notice.
+*/
+
+#include <math.h>
+#include <stdlib.h>
+#include "ssp_defs.h"
+
+#undef DEBUG
+
+int
+spivotL(
+        const int  jcol,     /* in */
+        const float u,      /* in - diagonal pivoting threshold */
+        int        *usepr,   /* re-use the pivot sequence given by perm_r/iperm_r */
+        int        *perm_r,  /* may be modified */
+        int        *iperm_r, /* in - inverse of perm_r */
+        int        *iperm_c, /* in - used to find diagonal of Pc*A*Pc' */
+        int        *pivrow,  /* out */
+        GlobalLU_t *Glu,     /* modified - global LU data structures */
+	SuperLUStat_t *stat  /* output */
+       )
+{
+/*
+ * Purpose
+ * =======
+ *   Performs the numerical pivoting on the current column of L,
+ *   and the CDIV operation.
+ *
+ *   Pivot policy:
+ *   (1) Compute thresh = u * max_(i>=j) abs(A_ij);
+ *   (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN
+ *           pivot row = k;
+ *       ELSE IF abs(A_jj) >= thresh THEN
+ *           pivot row = j;
+ *       ELSE
+ *           pivot row = m;
+ * 
+ *   Note: If you absolutely want to use a given pivot order, then set u=0.0.
+ *
+ *   Return value: 0      success;
+ *                 i > 0  U(i,i) is exactly zero.
+ *
+ */
+    int          fsupc;	    /* first column in the supernode */
+    int          nsupc;	    /* no of columns in the supernode */
+    int          nsupr;     /* no of rows in the supernode */
+    int          lptr;	    /* points to the starting subscript of the supernode */
+    int          pivptr, old_pivptr, diag, diagind;
+    float       pivmax, rtemp, thresh;
+    float       temp;
+    float       *lu_sup_ptr; 
+    float       *lu_col_ptr;
+    int          *lsub_ptr;
+    int          isub, icol, k, itemp;
+    int          *lsub, *xlsub;
+    float       *lusup;
+    int          *xlusup;
+    flops_t      *ops = stat->ops;
+
+    /* Initialize pointers */
+    lsub       = Glu->lsub;
+    xlsub      = Glu->xlsub;
+    lusup      = Glu->lusup;
+    xlusup     = Glu->xlusup;
+    fsupc      = (Glu->xsup)[(Glu->supno)[jcol]];
+    nsupc      = jcol - fsupc;	        /* excluding jcol; nsupc >= 0 */
+    lptr       = xlsub[fsupc];
+    nsupr      = xlsub[fsupc+1] - lptr;
+    lu_sup_ptr = &lusup[xlusup[fsupc]];	/* start of the current supernode */
+    lu_col_ptr = &lusup[xlusup[jcol]];	/* start of jcol in the supernode */
+    lsub_ptr   = &lsub[lptr];	/* start of row indices of the supernode */
+
+#ifdef DEBUG
+if ( jcol == MIN_COL ) {
+    printf("Before cdiv: col %d\n", jcol);
+    for (k = nsupc; k < nsupr; k++) 
+	printf("  lu[%d] %f\n", lsub_ptr[k], lu_col_ptr[k]);
+}
+#endif
+    
+    /* Determine the largest abs numerical value for partial pivoting;
+       Also search for user-specified pivot, and diagonal element. */
+    if ( *usepr ) *pivrow = iperm_r[jcol];
+    diagind = iperm_c[jcol];
+    pivmax = 0.0;
+    pivptr = nsupc;
+    diag = EMPTY;
+    old_pivptr = nsupc;
+    for (isub = nsupc; isub < nsupr; ++isub) {
+	rtemp = fabs (lu_col_ptr[isub]);
+	if ( rtemp > pivmax ) {
+	    pivmax = rtemp;
+	    pivptr = isub;
+	}
+	if ( *usepr && lsub_ptr[isub] == *pivrow ) old_pivptr = isub;
+	if ( lsub_ptr[isub] == diagind ) diag = isub;
+    }
+
+    /* Test for singularity */
+    if ( pivmax == 0.0 ) {
+	*pivrow = lsub_ptr[pivptr];
+	perm_r[*pivrow] = jcol;
+	*usepr = 0;
+	return (jcol+1);
+    }
+
+    thresh = u * pivmax;
+    
+    /* Choose appropriate pivotal element by our policy. */
+    if ( *usepr ) {
+        rtemp = fabs (lu_col_ptr[old_pivptr]);
+	if ( rtemp != 0.0 && rtemp >= thresh )
+	    pivptr = old_pivptr;
+	else
+	    *usepr = 0;
+    }
+    if ( *usepr == 0 ) {
+	/* Use diagonal pivot? */
+	if ( diag >= 0 ) { /* diagonal exists */
+	    rtemp = fabs (lu_col_ptr[diag]);
+	    if ( rtemp != 0.0 && rtemp >= thresh ) pivptr = diag;
+        }
+	*pivrow = lsub_ptr[pivptr];
+    }
+    
+    /* Record pivot row */
+    perm_r[*pivrow] = jcol;
+    
+    /* Interchange row subscripts */
+    if ( pivptr != nsupc ) {
+	itemp = lsub_ptr[pivptr];
+	lsub_ptr[pivptr] = lsub_ptr[nsupc];
+	lsub_ptr[nsupc] = itemp;
+
+	/* Interchange numerical values as well, for the whole snode, such 
+	 * that L is indexed the same way as A.
+ 	 */
+	for (icol = 0; icol <= nsupc; icol++) {
+	    itemp = pivptr + icol * nsupr;
+	    temp = lu_sup_ptr[itemp];
+	    lu_sup_ptr[itemp] = lu_sup_ptr[nsupc + icol*nsupr];
+	    lu_sup_ptr[nsupc + icol*nsupr] = temp;
+	}
+    } /* if */
+
+    /* cdiv operation */
+    ops[FACT] += nsupr - nsupc;
+
+    temp = 1.0 / lu_col_ptr[nsupc];
+    for (k = nsupc+1; k < nsupr; k++) 
+	lu_col_ptr[k] *= temp;
+
+    return 0;
+}
+
diff --git a/intern/opennl/superlu/spruneL.c b/intern/opennl/superlu/spruneL.c
new file mode 100644
index 00000000000..59702706375
--- /dev/null
+++ b/intern/opennl/superlu/spruneL.c
@@ -0,0 +1,149 @@
+
+
+/*
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ */
+/*
+  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ 
+  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ 
+  Permission is hereby granted to use or copy this program for any
+  purpose, provided the above notices are retained on all copies.
+  Permission to modify the code and to distribute modified code is
+  granted, provided the above notices are retained, and a notice that
+  the code was modified is included with the above copyright notice.
+*/
+
+#include "ssp_defs.h"
+#include "util.h"
+
+void
+spruneL(
+       const int  jcol,	     /* in */
+       const int  *perm_r,   /* in */
+       const int  pivrow,    /* in */
+       const int  nseg,	     /* in */
+       const int  *segrep,   /* in */
+       const int  *repfnz,   /* in */
+       int        *xprune,   /* out */
+       GlobalLU_t *Glu       /* modified - global LU data structures */
+       )
+{
+/*
+ * Purpose
+ * =======
+ *   Prunes the L-structure of supernodes whose L-structure
+ *   contains the current pivot row "pivrow"
+ *
+ */
+    float     utemp;
+    int        jsupno, irep, irep1, kmin, kmax, krow, movnum;
+    int        i, ktemp, minloc, maxloc;
+    int        do_prune; /* logical variable */
+    int        *xsup, *supno;
+    int        *lsub, *xlsub;
+    float     *lusup;
+    int        *xlusup;
+
+    xsup       = Glu->xsup;
+    supno      = Glu->supno;
+    lsub       = Glu->lsub;
+    xlsub      = Glu->xlsub;
+    lusup      = Glu->lusup;
+    xlusup     = Glu->xlusup;
+    
+    /*
+     * For each supernode-rep irep in U[*,j]
+     */
+    jsupno = supno[jcol];
+    for (i = 0; i < nseg; i++) {
+
+	irep = segrep[i];
+	irep1 = irep + 1;
+	do_prune = FALSE;
+
+	/* Don't prune with a zero U-segment */
+ 	if ( repfnz[irep] == EMPTY )
+		continue;
+
+     	/* If a snode overlaps with the next panel, then the U-segment 
+   	 * is fragmented into two parts -- irep and irep1. We should let
+	 * pruning occur at the rep-column in irep1's snode. 
+	 */
+	if ( supno[irep] == supno[irep1] ) 	/* Don't prune */
+		continue;
+
+	/*
+	 * If it has not been pruned & it has a nonz in row L[pivrow,i]
+	 */
+	if ( supno[irep] != jsupno ) {
+	    if ( xprune[irep] >= xlsub[irep1] ) {
+		kmin = xlsub[irep];
+		kmax = xlsub[irep1] - 1;
+		for (krow = kmin; krow <= kmax; krow++) 
+		    if ( lsub[krow] == pivrow ) {
+			do_prune = TRUE;
+			break;
+		    }
+	    }
+	    
+    	    if ( do_prune ) {
+
+	     	/* Do a quicksort-type partition
+	     	 * movnum=TRUE means that the num values have to be exchanged.
+	     	 */
+	        movnum = FALSE;
+	        if ( irep == xsup[supno[irep]] ) /* Snode of size 1 */
+			movnum = TRUE;
+
+	        while ( kmin <= kmax ) {
+
+	    	    if ( perm_r[lsub[kmax]] == EMPTY ) 
+			kmax--;
+		    else if ( perm_r[lsub[kmin]] != EMPTY )
+			kmin++;
+		    else { /* kmin below pivrow, and kmax above pivrow: 
+		            * 	interchange the two subscripts
+			    */
+		        ktemp = lsub[kmin];
+		        lsub[kmin] = lsub[kmax];
+		        lsub[kmax] = ktemp;
+
+			/* If the supernode has only one column, then we
+ 			 * only keep one set of subscripts. For any subscript 
+			 * interchange performed, similar interchange must be 
+			 * done on the numerical values.
+ 			 */
+		        if ( movnum ) {
+		    	    minloc = xlusup[irep] + (kmin - xlsub[irep]);
+		    	    maxloc = xlusup[irep] + (kmax - xlsub[irep]);
+			    utemp = lusup[minloc];
+		  	    lusup[minloc] = lusup[maxloc];
+			    lusup[maxloc] = utemp;
+		        }
+
+		        kmin++;
+		        kmax--;
+
+		    }
+
+	        } /* while */
+
+	        xprune[irep] = kmin;	/* Pruning */
+
+#ifdef CHK_PRUNE
+	printf("    After spruneL(),using col %d:  xprune[%d] = %d\n", 
+			jcol, irep, kmin);
+#endif
+	    } /* if do_prune */
+
+	} /* if */
+
+    } /* for each U-segment... */
+}
diff --git a/intern/opennl/superlu/ssnode_bmod.c b/intern/opennl/superlu/ssnode_bmod.c
new file mode 100644
index 00000000000..fe97abd9ff6
--- /dev/null
+++ b/intern/opennl/superlu/ssnode_bmod.c
@@ -0,0 +1,117 @@
+
+/*
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ */
+/*
+  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ 
+  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ 
+  Permission is hereby granted to use or copy this program for any
+  purpose, provided the above notices are retained on all copies.
+  Permission to modify the code and to distribute modified code is
+  granted, provided the above notices are retained, and a notice that
+  the code was modified is included with the above copyright notice.
+*/
+
+#include "ssp_defs.h"
+
+void slsolve(int, int, float*, float*);
+void smatvec(int, int, int, float*, float*, float*);
+
+/*
+ * Performs numeric block updates within the relaxed snode. 
+ */
+int
+ssnode_bmod (
+	    const int  jcol,	  /* in */
+	    const int  fsupc,     /* in */
+	    float     *dense,    /* in */
+	    float     *tempv,    /* working array */
+	    GlobalLU_t *Glu,      /* modified */
+	    SuperLUStat_t *stat   /* output */
+	    )
+{
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+    _fcd ftcs1 = _cptofcd("L", strlen("L")),
+	 ftcs2 = _cptofcd("N", strlen("N")),
+	 ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+    int            incx = 1, incy = 1;
+    float         alpha = -1.0, beta = 1.0;
+#endif
+
+    int            luptr, nsupc, nsupr, nrow;
+    int            isub, irow, i, iptr; 
+    register int   ufirst, nextlu;
+    int            *lsub, *xlsub;
+    float         *lusup;
+    int            *xlusup;
+    flops_t *ops = stat->ops;
+
+    lsub    = Glu->lsub;
+    xlsub   = Glu->xlsub;
+    lusup   = Glu->lusup;
+    xlusup  = Glu->xlusup;
+
+    nextlu = xlusup[jcol];
+    
+    /*
+     *	Process the supernodal portion of L\U[*,j]
+     */
+    for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) {
+  	irow = lsub[isub];
+	lusup[nextlu] = dense[irow];
+	dense[irow] = 0;
+	++nextlu;
+    }
+
+    xlusup[jcol + 1] = nextlu;	/* Initialize xlusup for next column */
+    
+    if ( fsupc < jcol ) {
+
+	luptr = xlusup[fsupc];
+	nsupr = xlsub[fsupc+1] - xlsub[fsupc];
+	nsupc = jcol - fsupc;	/* Excluding jcol */
+	ufirst = xlusup[jcol];	/* Points to the beginning of column
+				   jcol in supernode L\U(jsupno). */
+	nrow = nsupr - nsupc;
+
+	ops[TRSV] += nsupc * (nsupc - 1);
+	ops[GEMV] += 2 * nrow * nsupc;
+
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+	STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr], &nsupr, 
+	      &lusup[ufirst], &incx );
+	SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, 
+		&lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
+#else
+	strsv_( "L", "N", "U", &nsupc, &lusup[luptr], &nsupr, 
+	      &lusup[ufirst], &incx );
+	sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr, 
+		&lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
+#endif
+#else
+	slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );
+	smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc], 
+			&lusup[ufirst], &tempv[0] );
+
+        /* Scatter tempv[*] into lusup[*] */
+	iptr = ufirst + nsupc;
+	for (i = 0; i < nrow; i++) {
+	    lusup[iptr++] -= tempv[i];
+	    tempv[i] = 0.0;
+	}
+#endif
+
+    }
+
+    return 0;
+}
diff --git a/intern/opennl/superlu/ssnode_dfs.c b/intern/opennl/superlu/ssnode_dfs.c
new file mode 100644
index 00000000000..c8974237a9a
--- /dev/null
+++ b/intern/opennl/superlu/ssnode_dfs.c
@@ -0,0 +1,106 @@
+
+
+/*
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ */
+/*
+  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ 
+  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ 
+  Permission is hereby granted to use or copy this program for any
+  purpose, provided the above notices are retained on all copies.
+  Permission to modify the code and to distribute modified code is
+  granted, provided the above notices are retained, and a notice that
+  the code was modified is included with the above copyright notice.
+*/
+
+#include "ssp_defs.h"
+#include "util.h"
+
+int
+ssnode_dfs (
+	   const int  jcol,	    /* in - start of the supernode */
+	   const int  kcol, 	    /* in - end of the supernode */
+	   const int  *asub,        /* in */
+	   const int  *xa_begin,    /* in */
+	   const int  *xa_end,      /* in */
+	   int        *xprune,      /* out */
+	   int        *marker,      /* modified */
+	   GlobalLU_t *Glu          /* modified */
+	   )
+{
+/* Purpose
+ * =======
+ *    ssnode_dfs() - Determine the union of the row structures of those 
+ *    columns within the relaxed snode.
+ *    Note: The relaxed snodes are leaves of the supernodal etree, therefore, 
+ *    the portion outside the rectangular supernode must be zero.
+ *
+ * Return value
+ * ============
+ *     0   success;
+ *    >0   number of bytes allocated when run out of memory.
+ *
+ */
+    register int i, k, ifrom, ito, nextl, new_next;
+    int          nsuper, krow, kmark, mem_error;
+    int          *xsup, *supno;
+    int          *lsub, *xlsub;
+    int          nzlmax;
+    
+    xsup    = Glu->xsup;
+    supno   = Glu->supno;
+    lsub    = Glu->lsub;
+    xlsub   = Glu->xlsub;
+    nzlmax  = Glu->nzlmax;
+
+    nsuper = ++supno[jcol];	/* Next available supernode number */
+    nextl = xlsub[jcol];
+
+    for (i = jcol; i <= kcol; i++) {
+	/* For each nonzero in A[*,i] */
+	for (k = xa_begin[i]; k < xa_end[i]; k++) {	
+	    krow = asub[k];
+	    kmark = marker[krow];
+	    if ( kmark != kcol ) { /* First time visit krow */
+		marker[krow] = kcol;
+		lsub[nextl++] = krow;
+		if ( nextl >= nzlmax ) {
+		    if ( (mem_error = sLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu)) )
+			return (mem_error);
+		    lsub = Glu->lsub;
+		}
+	    }
+    	}
+	supno[i] = nsuper;
+    }
+
+    /* Supernode > 1, then make a copy of the subscripts for pruning */
+    if ( jcol < kcol ) {
+	new_next = nextl + (nextl - xlsub[jcol]);
+	while ( new_next > nzlmax ) {
+	    if ( (mem_error = sLUMemXpand(jcol, nextl, LSUB, &nzlmax, Glu)) )
+		return (mem_error);
+	    lsub = Glu->lsub;
+	}
+	ito = nextl;
+	for (ifrom = xlsub[jcol]; ifrom < nextl; )
+	    lsub[ito++] = lsub[ifrom++];	
+        for (i = jcol+1; i <= kcol; i++) xlsub[i] = nextl;
+	nextl = ito;
+    }
+
+    xsup[nsuper+1] = kcol + 1;
+    supno[kcol+1]  = nsuper;
+    xprune[kcol]   = nextl;
+    xlsub[kcol+1]  = nextl;
+
+    return 0;
+}
+
diff --git a/intern/opennl/superlu/ssp_blas2.c b/intern/opennl/superlu/ssp_blas2.c
new file mode 100644
index 00000000000..347f9ab5fd4
--- /dev/null
+++ b/intern/opennl/superlu/ssp_blas2.c
@@ -0,0 +1,469 @@
+
+/*
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ */
+/*
+ * File name:		ssp_blas2.c
+ * Purpose:		Sparse BLAS 2, using some dense BLAS 2 operations.
+ */
+
+#include "ssp_defs.h"
+
+/* 
+ * Function prototypes 
+ */
+void susolve(int, int, float*, float*);
+void slsolve(int, int, float*, float*);
+void smatvec(int, int, int, float*, float*, float*);
+int strsv_(char*, char*, char*, int*, float*, int*, float*, int*);
+
+int
+sp_strsv(char *uplo, char *trans, char *diag, SuperMatrix *L, 
+         SuperMatrix *U, float *x, SuperLUStat_t *stat, int *info)
+{
+/*
+ *   Purpose
+ *   =======
+ *
+ *   sp_strsv() solves one of the systems of equations   
+ *       A*x = b,   or   A'*x = b,
+ *   where b and x are n element vectors and A is a sparse unit , or   
+ *   non-unit, upper or lower triangular matrix.   
+ *   No test for singularity or near-singularity is included in this   
+ *   routine. Such tests must be performed before calling this routine.   
+ *
+ *   Parameters   
+ *   ==========   
+ *
+ *   uplo   - (input) char*
+ *            On entry, uplo specifies whether the matrix is an upper or   
+ *             lower triangular matrix as follows:   
+ *                uplo = 'U' or 'u'   A is an upper triangular matrix.   
+ *                uplo = 'L' or 'l'   A is a lower triangular matrix.   
+ *
+ *   trans  - (input) char*
+ *             On entry, trans specifies the equations to be solved as   
+ *             follows:   
+ *                trans = 'N' or 'n'   A*x = b.   
+ *                trans = 'T' or 't'   A'*x = b.
+ *                trans = 'C' or 'c'   A'*x = b.   
+ *
+ *   diag   - (input) char*
+ *             On entry, diag specifies whether or not A is unit   
+ *             triangular as follows:   
+ *                diag = 'U' or 'u'   A is assumed to be unit triangular.   
+ *                diag = 'N' or 'n'   A is not assumed to be unit   
+ *                                    triangular.   
+ *	     
+ *   L       - (input) SuperMatrix*
+ *	       The factor L from the factorization Pr*A*Pc=L*U. Use
+ *             compressed row subscripts storage for supernodes,
+ *             i.e., L has types: Stype = SC, Dtype = SLU_S, Mtype = TRLU.
+ *
+ *   U       - (input) SuperMatrix*
+ *	        The factor U from the factorization Pr*A*Pc=L*U.
+ *	        U has types: Stype = NC, Dtype = SLU_S, Mtype = TRU.
+ *    
+ *   x       - (input/output) float*
+ *             Before entry, the incremented array X must contain the n   
+ *             element right-hand side vector b. On exit, X is overwritten 
+ *             with the solution vector x.
+ *
+ *   info    - (output) int*
+ *             If *info = -i, the i-th argument had an illegal value.
+ *
+ */
+#ifdef _CRAY
+    _fcd ftcs1 = _cptofcd("L", strlen("L")),
+	 ftcs2 = _cptofcd("N", strlen("N")),
+	 ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+    SCformat *Lstore;
+    NCformat *Ustore;
+    float   *Lval, *Uval;
+    int incx = 1;
+    int nrow;
+    int fsupc, nsupr, nsupc, luptr, istart, irow;
+    int i, k, iptr, jcol;
+    float *work;
+    flops_t solve_ops;
+
+    /* Test the input parameters */
+    *info = 0;
+    if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1;
+    else if ( !lsame_(trans, "N") && !lsame_(trans, "T") && 
+              !lsame_(trans, "C")) *info = -2;
+    else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3;
+    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
+    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
+    if ( *info ) {
+	i = -(*info);
+	xerbla_("sp_strsv", &i);
+	return 0;
+    }
+
+    Lstore = L->Store;
+    Lval = Lstore->nzval;
+    Ustore = U->Store;
+    Uval = Ustore->nzval;
+    solve_ops = 0;
+
+    if ( !(work = floatCalloc(L->nrow)) )
+	ABORT("Malloc fails for work in sp_strsv().");
+    
+    if ( lsame_(trans, "N") ) {	/* Form x := inv(A)*x. */
+	
+	if ( lsame_(uplo, "L") ) {
+	    /* Form x := inv(L)*x */
+    	    if ( L->nrow == 0 ) return 0; /* Quick return */
+	    
+	    for (k = 0; k <= Lstore->nsuper; k++) {
+		fsupc = L_FST_SUPC(k);
+		istart = L_SUB_START(fsupc);
+		nsupr = L_SUB_START(fsupc+1) - istart;
+		nsupc = L_FST_SUPC(k+1) - fsupc;
+		luptr = L_NZ_START(fsupc);
+		nrow = nsupr - nsupc;
+
+	        solve_ops += nsupc * (nsupc - 1);
+	        solve_ops += 2 * nrow * nsupc;
+
+		if ( nsupc == 1 ) {
+		    for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) {
+			irow = L_SUB(iptr);
+			++luptr;
+			x[irow] -= x[fsupc] * Lval[luptr];
+		    }
+		} else {
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+		    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
+		       	&x[fsupc], &incx);
+		
+		    SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
+		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
+#else
+		    strsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
+		       	&x[fsupc], &incx);
+		
+		    sgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
+		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
+#endif
+#else
+		    slsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);
+		
+		    smatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
+                             &x[fsupc], &work[0] );
+#endif		
+		
+		    iptr = istart + nsupc;
+		    for (i = 0; i < nrow; ++i, ++iptr) {
+			irow = L_SUB(iptr);
+			x[irow] -= work[i];	/* Scatter */
+			work[i] = 0.0;
+
+		    }
+	 	}
+	    } /* for k ... */
+	    
+	} else {
+	    /* Form x := inv(U)*x */
+	    
+	    if ( U->nrow == 0 ) return 0; /* Quick return */
+	    
+	    for (k = Lstore->nsuper; k >= 0; k--) {
+	    	fsupc = L_FST_SUPC(k);
+	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
+	    	nsupc = L_FST_SUPC(k+1) - fsupc;
+	    	luptr = L_NZ_START(fsupc);
+		
+    	        solve_ops += nsupc * (nsupc + 1);
+
+		if ( nsupc == 1 ) {
+		    x[fsupc] /= Lval[luptr];
+		    for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) {
+			irow = U_SUB(i);
+			x[irow] -= x[fsupc] * Uval[i];
+		    }
+		} else {
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+		    STRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
+		       &x[fsupc], &incx);
+#else
+		    strsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
+                           &x[fsupc], &incx);
+#endif
+#else		
+		    susolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
+#endif		
+
+		    for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
+		        solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
+		    	for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); 
+				i++) {
+			    irow = U_SUB(i);
+			    x[irow] -= x[jcol] * Uval[i];
+		    	}
+                    }
+		}
+	    } /* for k ... */
+	    
+	}
+    } else { /* Form x := inv(A')*x */
+	
+	if ( lsame_(uplo, "L") ) {
+	    /* Form x := inv(L')*x */
+    	    if ( L->nrow == 0 ) return 0; /* Quick return */
+	    
+	    for (k = Lstore->nsuper; k >= 0; --k) {
+	    	fsupc = L_FST_SUPC(k);
+	    	istart = L_SUB_START(fsupc);
+	    	nsupr = L_SUB_START(fsupc+1) - istart;
+	    	nsupc = L_FST_SUPC(k+1) - fsupc;
+	    	luptr = L_NZ_START(fsupc);
+
+		solve_ops += 2 * (nsupr - nsupc) * nsupc;
+
+		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
+		    iptr = istart + nsupc;
+		    for (i = L_NZ_START(jcol) + nsupc; 
+				i < L_NZ_START(jcol+1); i++) {
+			irow = L_SUB(iptr);
+			x[jcol] -= x[irow] * Lval[i];
+			iptr++;
+		    }
+		}
+		
+		if ( nsupc > 1 ) {
+		    solve_ops += nsupc * (nsupc - 1);
+#ifdef _CRAY
+                    ftcs1 = _cptofcd("L", strlen("L"));
+                    ftcs2 = _cptofcd("T", strlen("T"));
+                    ftcs3 = _cptofcd("U", strlen("U"));
+		    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
+			&x[fsupc], &incx);
+#else
+		    strsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
+			&x[fsupc], &incx);
+#endif
+		}
+	    }
+	} else {
+	    /* Form x := inv(U')*x */
+	    if ( U->nrow == 0 ) return 0; /* Quick return */
+	    
+	    for (k = 0; k <= Lstore->nsuper; k++) {
+	    	fsupc = L_FST_SUPC(k);
+	    	nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
+	    	nsupc = L_FST_SUPC(k+1) - fsupc;
+	    	luptr = L_NZ_START(fsupc);
+
+		for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
+		    solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
+		    for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
+			irow = U_SUB(i);
+			x[jcol] -= x[irow] * Uval[i];
+		    }
+		}
+
+		solve_ops += nsupc * (nsupc + 1);
+
+		if ( nsupc == 1 ) {
+		    x[fsupc] /= Lval[luptr];
+		} else {
+#ifdef _CRAY
+                    ftcs1 = _cptofcd("U", strlen("U"));
+                    ftcs2 = _cptofcd("T", strlen("T"));
+                    ftcs3 = _cptofcd("N", strlen("N"));
+		    STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
+			    &x[fsupc], &incx);
+#else
+		    strsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
+			    &x[fsupc], &incx);
+#endif
+		}
+	    } /* for k ... */
+	}
+    }
+
+    stat->ops[SOLVE] += solve_ops;
+    SUPERLU_FREE(work);
+    return 0;
+}
+
+
+
+
+int
+sp_sgemv(char *trans, float alpha, SuperMatrix *A, float *x, 
+	 int incx, float beta, float *y, int incy)
+{
+/*  Purpose   
+    =======   
+
+    sp_sgemv()  performs one of the matrix-vector operations   
+       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
+    where alpha and beta are scalars, x and y are vectors and A is a
+    sparse A->nrow by A->ncol matrix.   
+
+    Parameters   
+    ==========   
+
+    TRANS  - (input) char*
+             On entry, TRANS specifies the operation to be performed as   
+             follows:   
+                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
+                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
+                TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   
+
+    ALPHA  - (input) float
+             On entry, ALPHA specifies the scalar alpha.   
+
+    A      - (input) SuperMatrix*
+             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
+             Currently, the type of A can be:
+                 Stype = NC or NCP; Dtype = SLU_S; Mtype = GE. 
+             In the future, more general A can be handled.
+
+    X      - (input) float*, array of DIMENSION at least   
+             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
+             and at least   
+             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
+             Before entry, the incremented array X must contain the   
+             vector x.   
+
+    INCX   - (input) int
+             On entry, INCX specifies the increment for the elements of   
+             X. INCX must not be zero.   
+
+    BETA   - (input) float
+             On entry, BETA specifies the scalar beta. When BETA is   
+             supplied as zero then Y need not be set on input.   
+
+    Y      - (output) float*,  array of DIMENSION at least   
+             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
+             and at least   
+             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
+             Before entry with BETA non-zero, the incremented array Y   
+             must contain the vector y. On exit, Y is overwritten by the 
+             updated vector y.
+	     
+    INCY   - (input) int
+             On entry, INCY specifies the increment for the elements of   
+             Y. INCY must not be zero.   
+
+    ==== Sparse Level 2 Blas routine.   
+*/
+
+    /* Local variables */
+    NCformat *Astore;
+    float   *Aval;
+    int info;
+    float temp;
+    int lenx, leny, i, j, irow;
+    int iy, jx, jy, kx, ky;
+    int notran;
+
+    notran = lsame_(trans, "N");
+    Astore = A->Store;
+    Aval = Astore->nzval;
+    
+    /* Test the input parameters */
+    info = 0;
+    if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1;
+    else if ( A->nrow < 0 || A->ncol < 0 ) info = 3;
+    else if (incx == 0) info = 5;
+    else if (incy == 0)	info = 8;
+    if (info != 0) {
+	xerbla_("sp_sgemv ", &info);
+	return 0;
+    }
+
+    /* Quick return if possible. */
+    if (A->nrow == 0 || A->ncol == 0 || (alpha == 0. && beta == 1.))
+	return 0;
+
+    /* Set  LENX  and  LENY, the lengths of the vectors x and y, and set 
+       up the start points in  X  and  Y. */
+    if (lsame_(trans, "N")) {
+	lenx = A->ncol;
+	leny = A->nrow;
+    } else {
+	lenx = A->nrow;
+	leny = A->ncol;
+    }
+    if (incx > 0) kx = 0;
+    else kx =  - (lenx - 1) * incx;
+    if (incy > 0) ky = 0;
+    else ky =  - (leny - 1) * incy;
+
+    /* Start the operations. In this version the elements of A are   
+       accessed sequentially with one pass through A. */
+    /* First form  y := beta*y. */
+    if (beta != 1.) {
+	if (incy == 1) {
+	    if (beta == 0.)
+		for (i = 0; i < leny; ++i) y[i] = 0.;
+	    else
+		for (i = 0; i < leny; ++i) y[i] = beta * y[i];
+	} else {
+	    iy = ky;
+	    if (beta == 0.)
+		for (i = 0; i < leny; ++i) {
+		    y[iy] = 0.;
+		    iy += incy;
+		}
+	    else
+		for (i = 0; i < leny; ++i) {
+		    y[iy] = beta * y[iy];
+		    iy += incy;
+		}
+	}
+    }
+    
+    if (alpha == 0.) return 0;
+
+    if ( notran ) {
+	/* Form  y := alpha*A*x + y. */
+	jx = kx;
+	if (incy == 1) {
+	    for (j = 0; j < A->ncol; ++j) {
+		if (x[jx] != 0.) {
+		    temp = alpha * x[jx];
+		    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
+			irow = Astore->rowind[i];
+			y[irow] += temp * Aval[i];
+		    }
+		}
+		jx += incx;
+	    }
+	} else {
+	    ABORT("Not implemented.");
+	}
+    } else {
+	/* Form  y := alpha*A'*x + y. */
+	jy = ky;
+	if (incx == 1) {
+	    for (j = 0; j < A->ncol; ++j) {
+		temp = 0.;
+		for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
+		    irow = Astore->rowind[i];
+		    temp += Aval[i] * x[irow];
+		}
+		y[jy] += alpha * temp;
+		jy += incy;
+	    }
+	} else {
+	    ABORT("Not implemented.");
+	}
+    }
+    return 0;
+} /* sp_sgemv */
+
+
+
diff --git a/intern/opennl/superlu/ssp_blas3.c b/intern/opennl/superlu/ssp_blas3.c
new file mode 100644
index 00000000000..19086077c4c
--- /dev/null
+++ b/intern/opennl/superlu/ssp_blas3.c
@@ -0,0 +1,121 @@
+
+
+/*
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ */
+/*
+ * File name:		sp_blas3.c
+ * Purpose:		Sparse BLAS3, using some dense BLAS3 operations.
+ */
+
+#include "ssp_defs.h"
+#include "util.h"
+
+int
+sp_sgemm(char *transa, int n, 
+         float alpha, SuperMatrix *A, float *b, int ldb, 
+         float beta, float *c, int ldc)
+{
+/*  Purpose   
+    =======   
+
+    sp_s performs one of the matrix-matrix operations   
+
+       C := alpha*op( A )*op( B ) + beta*C,   
+
+    where  op( X ) is one of 
+
+       op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ),
+
+    alpha and beta are scalars, and A, B and C are matrices, with op( A ) 
+    an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix. 
+  
+
+    Parameters   
+    ==========   
+
+    TRANSA - (input) char*
+             On entry, TRANSA specifies the form of op( A ) to be used in 
+             the matrix multiplication as follows:   
+                TRANSA = 'N' or 'n',  op( A ) = A.   
+                TRANSA = 'T' or 't',  op( A ) = A'.   
+                TRANSA = 'C' or 'c',  op( A ) = conjg( A' ).   
+             Unchanged on exit.   
+
+    TRANSB - (input) char*
+             On entry, TRANSB specifies the form of op( B ) to be used in 
+             the matrix multiplication as follows:   
+                TRANSB = 'N' or 'n',  op( B ) = B.   
+                TRANSB = 'T' or 't',  op( B ) = B'.   
+                TRANSB = 'C' or 'c',  op( B ) = conjg( B' ).   
+             Unchanged on exit.   
+
+    M      - (input) int   
+             On entry,  M  specifies  the number of rows of the matrix 
+	     op( A ) and of the matrix C.  M must be at least zero. 
+	     Unchanged on exit.   
+
+    N      - (input) int
+             On entry,  N specifies the number of columns of the matrix 
+	     op( B ) and the number of columns of the matrix C. N must be 
+	     at least zero.
+	     Unchanged on exit.   
+
+    K      - (input) int
+             On entry, K specifies the number of columns of the matrix 
+	     op( A ) and the number of rows of the matrix op( B ). K must 
+	     be at least  zero.   
+             Unchanged on exit.
+	     
+    ALPHA  - (input) float
+             On entry, ALPHA specifies the scalar alpha.   
+
+    A      - (input) SuperMatrix*
+             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
+             Currently, the type of A can be:
+                 Stype = NC or NCP; Dtype = SLU_S; Mtype = GE. 
+             In the future, more general A can be handled.
+
+    B      - FLOAT PRECISION array of DIMENSION ( LDB, kb ), where kb is 
+             n when TRANSB = 'N' or 'n',  and is  k otherwise.   
+             Before entry with  TRANSB = 'N' or 'n',  the leading k by n 
+             part of the array B must contain the matrix B, otherwise 
+             the leading n by k part of the array B must contain the 
+             matrix B.   
+             Unchanged on exit.   
+
+    LDB    - (input) int
+             On entry, LDB specifies the first dimension of B as declared 
+             in the calling (sub) program. LDB must be at least max( 1, n ).  
+             Unchanged on exit.   
+
+    BETA   - (input) float
+             On entry, BETA specifies the scalar beta. When BETA is   
+             supplied as zero then C need not be set on input.   
+
+    C      - FLOAT PRECISION array of DIMENSION ( LDC, n ).   
+             Before entry, the leading m by n part of the array C must 
+             contain the matrix C,  except when beta is zero, in which 
+             case C need not be set on entry.   
+             On exit, the array C is overwritten by the m by n matrix 
+	     ( alpha*op( A )*B + beta*C ).   
+
+    LDC    - (input) int
+             On entry, LDC specifies the first dimension of C as declared 
+             in the calling (sub)program. LDC must be at least max(1,m).   
+             Unchanged on exit.   
+
+    ==== Sparse Level 3 Blas routine.   
+*/
+    int    incx = 1, incy = 1;
+    int    j;
+
+    for (j = 0; j < n; ++j) {
+	sp_sgemv(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy);
+    }
+    return 0;    
+}
diff --git a/intern/opennl/superlu/ssp_defs.h b/intern/opennl/superlu/ssp_defs.h
new file mode 100644
index 00000000000..5b4e86b175b
--- /dev/null
+++ b/intern/opennl/superlu/ssp_defs.h
@@ -0,0 +1,234 @@
+
+/*
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ */
+#ifndef __SUPERLU_sSP_DEFS /* allow multiple inclusions */
+#define __SUPERLU_sSP_DEFS
+
+/*
+ * File name:		ssp_defs.h
+ * Purpose:             Sparse matrix types and function prototypes
+ * History:
+ */
+
+#ifdef _CRAY
+#include <fortran.h>
+#include <string.h>
+#endif
+
+/* Define my integer type int_t */
+typedef int int_t; /* default */
+
+#include "Cnames.h"
+#include "supermatrix.h"
+#include "util.h"
+
+
+/*
+ * Global data structures used in LU factorization -
+ * 
+ *   nsuper: #supernodes = nsuper + 1, numbered [0, nsuper].
+ *   (xsup,supno): supno[i] is the supernode no to which i belongs;
+ *	xsup(s) points to the beginning of the s-th supernode.
+ *	e.g.   supno 0 1 2 2 3 3 3 4 4 4 4 4   (n=12)
+ *	        xsup 0 1 2 4 7 12
+ *	Note: dfs will be performed on supernode rep. relative to the new 
+ *	      row pivoting ordering
+ *
+ *   (xlsub,lsub): lsub[*] contains the compressed subscript of
+ *	rectangular supernodes; xlsub[j] points to the starting
+ *	location of the j-th column in lsub[*]. Note that xlsub 
+ *	is indexed by column.
+ *	Storage: original row subscripts
+ *
+ *      During the course of sparse LU factorization, we also use
+ *	(xlsub,lsub) for the purpose of symmetric pruning. For each
+ *	supernode {s,s+1,...,t=s+r} with first column s and last
+ *	column t, the subscript set
+ *		lsub[j], j=xlsub[s], .., xlsub[s+1]-1
+ *	is the structure of column s (i.e. structure of this supernode).
+ *	It is used for the storage of numerical values.
+ *	Furthermore,
+ *		lsub[j], j=xlsub[t], .., xlsub[t+1]-1
+ *	is the structure of the last column t of this supernode.
+ *	It is for the purpose of symmetric pruning. Therefore, the
+ *	structural subscripts can be rearranged without making physical
+ *	interchanges among the numerical values.
+ *
+ *	However, if the supernode has only one column, then we
+ *	only keep one set of subscripts. For any subscript interchange
+ *	performed, similar interchange must be done on the numerical
+ *	values.
+ *
+ *	The last column structures (for pruning) will be removed
+ *	after the numercial LU factorization phase.
+ *
+ *   (xlusup,lusup): lusup[*] contains the numerical values of the
+ *	rectangular supernodes; xlusup[j] points to the starting
+ *	location of the j-th column in storage vector lusup[*]
+ *	Note: xlusup is indexed by column.
+ *	Each rectangular supernode is stored by column-major
+ *	scheme, consistent with Fortran 2-dim array storage.
+ *
+ *   (xusub,ucol,usub): ucol[*] stores the numerical values of
+ *	U-columns outside the rectangular supernodes. The row
+ *	subscript of nonzero ucol[k] is stored in usub[k].
+ *	xusub[i] points to the starting location of column i in ucol.
+ *	Storage: new row subscripts; that is subscripts of PA.
+ */
+typedef struct {
+    int     *xsup;    /* supernode and column mapping */
+    int     *supno;   
+    int     *lsub;    /* compressed L subscripts */
+    int	    *xlsub;
+    float  *lusup;   /* L supernodes */
+    int     *xlusup;
+    float  *ucol;    /* U columns */
+    int     *usub;
+    int	    *xusub;
+    int     nzlmax;   /* current max size of lsub */
+    int     nzumax;   /*    "    "    "      ucol */
+    int     nzlumax;  /*    "    "    "     lusup */
+    int     n;        /* number of columns in the matrix */
+    LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */
+} GlobalLU_t;
+
+typedef struct {
+    float for_lu;
+    float total_needed;
+    int   expansions;
+} mem_usage_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Driver routines */
+extern void
+sgssv(superlu_options_t *, SuperMatrix *, int *, int *, SuperMatrix *,
+      SuperMatrix *, SuperMatrix *, SuperLUStat_t *, int *);
+extern void
+sgssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *,
+       char *, float *, float *, SuperMatrix *, SuperMatrix *,
+       void *, int, SuperMatrix *, SuperMatrix *,
+       float *, float *, float *, float *,
+       mem_usage_t *, SuperLUStat_t *, int *);
+
+/* Supernodal LU factor related */
+extern void
+sCreate_CompCol_Matrix(SuperMatrix *, int, int, int, float *,
+		       int *, int *, Stype_t, Dtype_t, Mtype_t);
+extern void
+sCreate_CompRow_Matrix(SuperMatrix *, int, int, int, float *,
+		       int *, int *, Stype_t, Dtype_t, Mtype_t);
+extern void
+sCopy_CompCol_Matrix(SuperMatrix *, SuperMatrix *);
+extern void
+sCreate_Dense_Matrix(SuperMatrix *, int, int, float *, int,
+		     Stype_t, Dtype_t, Mtype_t);
+extern void
+sCreate_SuperNode_Matrix(SuperMatrix *, int, int, int, float *, 
+		         int *, int *, int *, int *, int *,
+			 Stype_t, Dtype_t, Mtype_t);
+extern void
+sCopy_Dense_Matrix(int, int, float *, int, float *, int);
+
+extern void    countnz (const int, int *, int *, int *, GlobalLU_t *);
+extern void    fixupL (const int, const int *, GlobalLU_t *);
+
+extern void    sallocateA (int, int, float **, int **, int **);
+extern void    sgstrf (superlu_options_t*, SuperMatrix*, 
+                       int, int, int*, void *, int, int *, int *, 
+                       SuperMatrix *, SuperMatrix *, SuperLUStat_t*, int *);
+extern int     ssnode_dfs (const int, const int, const int *, const int *,
+			     const int *, int *, int *, GlobalLU_t *);
+extern int     ssnode_bmod (const int, const int, float *,
+                              float *, GlobalLU_t *, SuperLUStat_t*);
+extern void    spanel_dfs (const int, const int, const int, SuperMatrix *,
+			   int *, int *, float *, int *, int *, int *,
+			   int *, int *, int *, int *, GlobalLU_t *);
+extern void    spanel_bmod (const int, const int, const int, const int,
+                           float *, float *, int *, int *,
+			   GlobalLU_t *, SuperLUStat_t*);
+extern int     scolumn_dfs (const int, const int, int *, int *, int *, int *,
+			   int *, int *, int *, int *, int *, GlobalLU_t *);
+extern int     scolumn_bmod (const int, const int, float *,
+			   float *, int *, int *, int,
+                           GlobalLU_t *, SuperLUStat_t*);
+extern int     scopy_to_ucol (int, int, int *, int *, int *,
+                              float *, GlobalLU_t *);         
+extern int     spivotL (const int, const float, int *, int *, 
+                         int *, int *, int *, GlobalLU_t *, SuperLUStat_t*);
+extern void    spruneL (const int, const int *, const int, const int,
+			  const int *, const int *, int *, GlobalLU_t *);
+extern void    sreadmt (int *, int *, int *, float **, int **, int **);
+extern void    sGenXtrue (int, int, float *, int);
+extern void    sFillRHS (trans_t, int, float *, int, SuperMatrix *,
+			  SuperMatrix *);
+extern void    sgstrs (trans_t, SuperMatrix *, SuperMatrix *, int *, int *,
+                        SuperMatrix *, SuperLUStat_t*, int *);
+
+
+/* Driver related */
+
+extern void    sgsequ (SuperMatrix *, float *, float *, float *,
+			float *, float *, int *);
+extern void    slaqgs (SuperMatrix *, float *, float *, float,
+                        float, float, char *);
+extern void    sgscon (char *, SuperMatrix *, SuperMatrix *, 
+		         float, float *, SuperLUStat_t*, int *);
+extern float   sPivotGrowth(int, SuperMatrix *, int *, 
+                            SuperMatrix *, SuperMatrix *);
+extern void    sgsrfs (trans_t, SuperMatrix *, SuperMatrix *,
+                       SuperMatrix *, int *, int *, char *, float *, 
+                       float *, SuperMatrix *, SuperMatrix *,
+                       float *, float *, SuperLUStat_t*, int *);
+
+extern int     sp_strsv (char *, char *, char *, SuperMatrix *,
+			SuperMatrix *, float *, SuperLUStat_t*, int *);
+extern int     sp_sgemv (char *, float, SuperMatrix *, float *,
+			int, float, float *, int);
+
+extern int     sp_sgemm (char *, int, float,
+			SuperMatrix *, float *, int, float, 
+			float *, int);
+
+/* Memory-related */
+extern int     sLUMemInit (fact_t, void *, int, int, int, int, int,
+			     SuperMatrix *, SuperMatrix *,
+			     GlobalLU_t *, int **, float **);
+extern void    sSetRWork (int, int, float *, float **, float **);
+extern void    sLUWorkFree (int *, float *, GlobalLU_t *);
+extern int     sLUMemXpand (int, int, MemType, int *, GlobalLU_t *);
+
+extern float  *floatMalloc(int);
+extern float  *floatCalloc(int);
+extern int     smemory_usage(const int, const int, const int, const int);
+extern int     sQuerySpace (SuperMatrix *, SuperMatrix *, mem_usage_t *);
+
+/* Auxiliary routines */
+extern void    sreadhb(int *, int *, int *, float **, int **, int **);
+extern void    sCompRow_to_CompCol(int, int, int, float*, int*, int*,
+		                   float **, int **, int **);
+extern void    sfill (float *, int, float);
+extern void    sinf_norm_error (int, SuperMatrix *, float *);
+extern void    PrintPerf (SuperMatrix *, SuperMatrix *, mem_usage_t *,
+			 float, float, float *, float *, char *);
+
+/* Routines for debugging */
+extern void    sPrint_CompCol_Matrix(char *, SuperMatrix *);
+extern void    sPrint_SuperNode_Matrix(char *, SuperMatrix *);
+extern void    sPrint_Dense_Matrix(char *, SuperMatrix *);
+extern void    print_lu_col(char *, int, int, int *, GlobalLU_t *);
+extern void    check_tempv(int, float *);
+
+#ifdef __cplusplus
+  }
+#endif
+
+#endif /* __SUPERLU_sSP_DEFS */
+
diff --git a/intern/opennl/superlu/strsv.c b/intern/opennl/superlu/strsv.c
new file mode 100644
index 00000000000..2f6a92c0d0d
--- /dev/null
+++ b/intern/opennl/superlu/strsv.c
@@ -0,0 +1,331 @@
+
+/* Subroutine */ int strsv_(char *uplo, char *trans, char *diag, int *n, 
+	float *a, int *lda, float *x, int *incx)
+{
+
+
+    /* System generated locals */
+    int i__1, i__2;
+
+    /* Local variables */
+    static int info;
+    static float temp;
+    static int i, j;
+    extern int lsame_(char *, char *);
+    static int ix, jx, kx;
+    extern /* Subroutine */ int xerbla_(char *, int *);
+    static int nounit;
+
+
+/*  Purpose   
+    =======   
+
+    STRSV  solves one of the systems of equations   
+
+       A*x = b,   or   A'*x = b,   
+
+    where b and x are n element vectors and A is an n by n unit, or   
+    non-unit, upper or lower triangular matrix.   
+
+    No test for singularity or near-singularity is included in this   
+    routine. Such tests must be performed before calling this routine.   
+
+    Parameters   
+    ==========   
+
+    UPLO   - CHARACTER*1.   
+             On entry, UPLO specifies whether the matrix is an upper or   
+             lower triangular matrix as follows:   
+
+                UPLO = 'U' or 'u'   A is an upper triangular matrix.   
+
+                UPLO = 'L' or 'l'   A is a lower triangular matrix.   
+
+             Unchanged on exit.   
+
+    TRANS  - CHARACTER*1.   
+             On entry, TRANS specifies the equations to be solved as   
+             follows:   
+
+                TRANS = 'N' or 'n'   A*x = b.   
+
+                TRANS = 'T' or 't'   A'*x = b.   
+
+                TRANS = 'C' or 'c'   A'*x = b.   
+
+             Unchanged on exit.   
+
+    DIAG   - CHARACTER*1.   
+             On entry, DIAG specifies whether or not A is unit   
+             triangular as follows:   
+
+                DIAG = 'U' or 'u'   A is assumed to be unit triangular.   
+
+                DIAG = 'N' or 'n'   A is not assumed to be unit   
+                                    triangular.   
+
+             Unchanged on exit.   
+
+    N      - INTEGER.   
+             On entry, N specifies the order of the matrix A.   
+             N must be at least zero.   
+             Unchanged on exit.   
+
+    A      - REAL             array of DIMENSION ( LDA, n ).   
+             Before entry with  UPLO = 'U' or 'u', the leading n by n   
+             upper triangular part of the array A must contain the upper 
+  
+             triangular matrix and the strictly lower triangular part of 
+  
+             A is not referenced.   
+             Before entry with UPLO = 'L' or 'l', the leading n by n   
+             lower triangular part of the array A must contain the lower 
+  
+             triangular matrix and the strictly upper triangular part of 
+  
+             A is not referenced.   
+             Note that when  DIAG = 'U' or 'u', the diagonal elements of 
+  
+             A are not referenced either, but are assumed to be unity.   
+             Unchanged on exit.   
+
+    LDA    - INTEGER.   
+             On entry, LDA specifies the first dimension of A as declared 
+  
+             in the calling (sub) program. LDA must be at least   
+             max( 1, n ).   
+             Unchanged on exit.   
+
+    X      - REAL             array of dimension at least   
+             ( 1 + ( n - 1 )*abs( INCX ) ).   
+             Before entry, the incremented array X must contain the n   
+             element right-hand side vector b. On exit, X is overwritten 
+  
+             with the solution vector x.   
+
+    INCX   - INTEGER.   
+             On entry, INCX specifies the increment for the elements of   
+             X. INCX must not be zero.   
+             Unchanged on exit.   
+
+
+    Level 2 Blas routine.   
+
+    -- Written on 22-October-1986.   
+       Jack Dongarra, Argonne National Lab.   
+       Jeremy Du Croz, Nag Central Office.   
+       Sven Hammarling, Nag Central Office.   
+       Richard Hanson, Sandia National Labs.   
+
+
+
+       Test the input parameters.   
+
+    
+   Parameter adjustments   
+       Function Body */
+#define X(I) x[(I)-1]
+
+#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
+
+    info = 0;
+    if (! lsame_(uplo, "U") && ! lsame_(uplo, "L")) {
+	info = 1;
+    } else if (! lsame_(trans, "N") && ! lsame_(trans, "T") &&
+	     ! lsame_(trans, "C")) {
+	info = 2;
+    } else if (! lsame_(diag, "U") && ! lsame_(diag, "N")) {
+	info = 3;
+    } else if (*n < 0) {
+	info = 4;
+    } else if (*lda < ((1 > *n)? 1: *n)) {
+	info = 6;
+    } else if (*incx == 0) {
+	info = 8;
+    }
+    if (info != 0) {
+	xerbla_("STRSV ", &info);
+	return 0;
+    }
+
+/*     Quick return if possible. */
+
+    if (*n == 0) {
+	return 0;
+    }
+
+    nounit = lsame_(diag, "N");
+
+/*     Set up the start point in X if the increment is not unity. This   
+       will be  ( N - 1 )*INCX  too small for descending loops. */
+
+    if (*incx <= 0) {
+	kx = 1 - (*n - 1) * *incx;
+    } else if (*incx != 1) {
+	kx = 1;
+    }
+
+/*     Start the operations. In this version the elements of A are   
+       accessed sequentially with one pass through A. */
+
+    if (lsame_(trans, "N")) {
+
+/*        Form  x := inv( A )*x. */
+
+	if (lsame_(uplo, "U")) {
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    if (X(j) != 0.f) {
+			if (nounit) {
+			    X(j) /= A(j,j);
+			}
+			temp = X(j);
+			for (i = j - 1; i >= 1; --i) {
+			    X(i) -= temp * A(i,j);
+/* L10: */
+			}
+		    }
+/* L20: */
+		}
+	    } else {
+		jx = kx + (*n - 1) * *incx;
+		for (j = *n; j >= 1; --j) {
+		    if (X(jx) != 0.f) {
+			if (nounit) {
+			    X(jx) /= A(j,j);
+			}
+			temp = X(jx);
+			ix = jx;
+			for (i = j - 1; i >= 1; --i) {
+			    ix -= *incx;
+			    X(ix) -= temp * A(i,j);
+/* L30: */
+			}
+		    }
+		    jx -= *incx;
+/* L40: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		i__1 = *n;
+		for (j = 1; j <= *n; ++j) {
+		    if (X(j) != 0.f) {
+			if (nounit) {
+			    X(j) /= A(j,j);
+			}
+			temp = X(j);
+			i__2 = *n;
+			for (i = j + 1; i <= *n; ++i) {
+			    X(i) -= temp * A(i,j);
+/* L50: */
+			}
+		    }
+/* L60: */
+		}
+	    } else {
+		jx = kx;
+		i__1 = *n;
+		for (j = 1; j <= *n; ++j) {
+		    if (X(jx) != 0.f) {
+			if (nounit) {
+			    X(jx) /= A(j,j);
+			}
+			temp = X(jx);
+			ix = jx;
+			i__2 = *n;
+			for (i = j + 1; i <= *n; ++i) {
+			    ix += *incx;
+			    X(ix) -= temp * A(i,j);
+/* L70: */
+			}
+		    }
+		    jx += *incx;
+/* L80: */
+		}
+	    }
+	}
+    } else {
+
+/*        Form  x := inv( A' )*x. */
+
+	if (lsame_(uplo, "U")) {
+	    if (*incx == 1) {
+		i__1 = *n;
+		for (j = 1; j <= *n; ++j) {
+		    temp = X(j);
+		    i__2 = j - 1;
+		    for (i = 1; i <= j-1; ++i) {
+			temp -= A(i,j) * X(i);
+/* L90: */
+		    }
+		    if (nounit) {
+			temp /= A(j,j);
+		    }
+		    X(j) = temp;
+/* L100: */
+		}
+	    } else {
+		jx = kx;
+		i__1 = *n;
+		for (j = 1; j <= *n; ++j) {
+		    temp = X(jx);
+		    ix = kx;
+		    i__2 = j - 1;
+		    for (i = 1; i <= j-1; ++i) {
+			temp -= A(i,j) * X(ix);
+			ix += *incx;
+/* L110: */
+		    }
+		    if (nounit) {
+			temp /= A(j,j);
+		    }
+		    X(jx) = temp;
+		    jx += *incx;
+/* L120: */
+		}
+	    }
+	} else {
+	    if (*incx == 1) {
+		for (j = *n; j >= 1; --j) {
+		    temp = X(j);
+		    i__1 = j + 1;
+		    for (i = *n; i >= j+1; --i) {
+			temp -= A(i,j) * X(i);
+/* L130: */
+		    }
+		    if (nounit) {
+			temp /= A(j,j);
+		    }
+		    X(j) = temp;
+/* L140: */
+		}
+	    } else {
+		kx += (*n - 1) * *incx;
+		jx = kx;
+		for (j = *n; j >= 1; --j) {
+		    temp = X(jx);
+		    ix = kx;
+		    i__1 = j + 1;
+		    for (i = *n; i >= j+1; --i) {
+			temp -= A(i,j) * X(ix);
+			ix -= *incx;
+/* L150: */
+		    }
+		    if (nounit) {
+			temp /= A(j,j);
+		    }
+		    X(jx) = temp;
+		    jx -= *incx;
+/* L160: */
+		}
+	    }
+	}
+    }
+
+    return 0;
+
+/*     End of STRSV . */
+
+} /* strsv_ */
+
diff --git a/intern/opennl/superlu/superlu_timer.c b/intern/opennl/superlu/superlu_timer.c
new file mode 100644
index 00000000000..798fd59d4ea
--- /dev/null
+++ b/intern/opennl/superlu/superlu_timer.c
@@ -0,0 +1,55 @@
+/* 
+ * Purpose
+ * ======= 
+ *	Returns the time in seconds used by the process.
+ *
+ * Note: the timer function call is machine dependent. Use conditional
+ *       compilation to choose the appropriate function.
+ *
+ */
+
+/* We want this flag, safer than putting in build system */
+#define NO_TIMER
+
+#ifdef SUN 
+/*
+ * 	It uses the system call gethrtime(3C), which is accurate to 
+ *	nanoseconds. 
+*/
+#include <sys/time.h>
+ 
+double SuperLU_timer_() {
+    return ( (double)gethrtime() / 1e9 );
+}
+
+#else
+
+#ifndef NO_TIMER
+#include <sys/types.h>
+#include <sys/times.h>
+#include <time.h>
+#include <sys/time.h>
+#endif
+
+#ifndef CLK_TCK
+#define CLK_TCK 60
+#endif
+
+double SuperLU_timer_()
+{
+#ifdef NO_TIMER
+    /* no sys/times.h on WIN32 */
+    double tmp;
+    tmp = 0.0;
+#else
+    struct tms use;
+    double tmp;
+    times(&use);
+    tmp = use.tms_utime;
+    tmp += use.tms_stime;
+#endif
+    return (double)(tmp) / CLK_TCK;
+}
+
+#endif
+
diff --git a/intern/opennl/superlu/supermatrix.h b/intern/opennl/superlu/supermatrix.h
new file mode 100644
index 00000000000..665e22dc91f
--- /dev/null
+++ b/intern/opennl/superlu/supermatrix.h
@@ -0,0 +1,140 @@
+#ifndef __SUPERLU_SUPERMATRIX /* allow multiple inclusions */
+#define __SUPERLU_SUPERMATRIX
+
+/********************************************
+ * The matrix types are defined as follows. *
+ ********************************************/
+typedef enum {
+    SLU_NC,    /* column-wise, no supernode */
+    SLU_NR,    /* row-wize, no supernode */
+    SLU_SC,    /* column-wise, supernode */
+    SLU_SR,    /* row-wise, supernode */
+    SLU_NCP,   /* column-wise, column-permuted, no supernode 
+                  (The consecutive columns of nonzeros, after permutation,
+		   may not be stored  contiguously.) */
+    SLU_DN     /* Fortran style column-wise storage for dense matrix */
+} Stype_t;
+
+typedef enum {
+    SLU_S,     /* single */
+    SLU_D,     /* double */
+    SLU_C,     /* single complex */
+    SLU_Z      /* double complex */
+} Dtype_t;
+
+typedef enum {
+    SLU_GE,    /* general */
+    SLU_TRLU,  /* lower triangular, unit diagonal */
+    SLU_TRUU,  /* upper triangular, unit diagonal */
+    SLU_TRL,   /* lower triangular */
+    SLU_TRU,   /* upper triangular */
+    SLU_SYL,   /* symmetric, store lower half */
+    SLU_SYU,   /* symmetric, store upper half */
+    SLU_HEL,   /* Hermitian, store lower half */
+    SLU_HEU    /* Hermitian, store upper half */
+} Mtype_t;
+
+typedef struct {
+	Stype_t Stype; /* Storage type: interprets the storage structure 
+		   	  pointed to by *Store. */
+	Dtype_t Dtype; /* Data type. */
+	Mtype_t Mtype; /* Matrix type: describes the mathematical property of 
+			  the matrix. */
+	int_t  nrow;   /* number of rows */
+	int_t  ncol;   /* number of columns */
+	void *Store;   /* pointer to the actual storage of the matrix */
+} SuperMatrix;
+
+/***********************************************
+ * The storage schemes are defined as follows. *
+ ***********************************************/
+
+/* Stype == NC (Also known as Harwell-Boeing sparse matrix format) */
+typedef struct {
+    int_t  nnz;	    /* number of nonzeros in the matrix */
+    void   *nzval;  /* pointer to array of nonzero values, packed by column */
+    int_t  *rowind; /* pointer to array of row indices of the nonzeros */
+    int_t  *colptr; /* pointer to array of beginning of columns in nzval[] 
+		       and rowind[]  */
+                    /* Note:
+		       Zero-based indexing is used;
+		       colptr[] has ncol+1 entries, the last one pointing
+		       beyond the last column, so that colptr[ncol] = nnz. */
+} NCformat;
+
+/* Stype == NR (Also known as row compressed storage (RCS). */
+typedef struct {
+    int_t nnz;	   /* number of nonzeros in the matrix */
+    void  *nzval;  /* pointer to array of nonzero values, packed by row */
+    int_t *colind; /* pointer to array of column indices of the nonzeros */
+    int_t *rowptr; /* pointer to array of beginning of rows in nzval[] 
+                      and colind[]  */
+                   /* Note:
+		      Zero-based indexing is used;
+		      nzval[] and colind[] are of the same length, nnz;
+		      rowptr[] has nrow+1 entries, the last one pointing
+		      beyond the last column, so that rowptr[nrow] = nnz. */
+} NRformat;
+
+/* Stype == SC */
+typedef struct {
+  int_t  nnz;	     /* number of nonzeros in the matrix */
+  int_t  nsuper;     /* number of supernodes, minus 1 */
+  void *nzval;       /* pointer to array of nonzero values, packed by column */
+  int_t *nzval_colptr;/* pointer to array of beginning of columns in nzval[] */
+  int_t *rowind;     /* pointer to array of compressed row indices of 
+			rectangular supernodes */
+  int_t *rowind_colptr;/* pointer to array of beginning of columns in rowind[] */
+  int_t *col_to_sup; /* col_to_sup[j] is the supernode number to which column 
+			j belongs; mapping from column to supernode number. */
+  int_t *sup_to_col; /* sup_to_col[s] points to the start of the s-th 
+			supernode; mapping from supernode number to column.
+		        e.g.: col_to_sup: 0 1 2 2 3 3 3 4 4 4 4 4 4 (ncol=12)
+		              sup_to_col: 0 1 2 4 7 12           (nsuper=4) */
+                     /* Note:
+		        Zero-based indexing is used;
+		        nzval_colptr[], rowind_colptr[], col_to_sup and
+		        sup_to_col[] have ncol+1 entries, the last one
+		        pointing beyond the last column.
+		        For col_to_sup[], only the first ncol entries are
+		        defined. For sup_to_col[], only the first nsuper+2
+		        entries are defined. */
+} SCformat;
+
+/* Stype == NCP */
+typedef struct {
+    int_t nnz;	  /* number of nonzeros in the matrix */
+    void *nzval;  /* pointer to array of nonzero values, packed by column */
+    int_t *rowind;/* pointer to array of row indices of the nonzeros */
+		  /* Note: nzval[]/rowind[] always have the same length */
+    int_t *colbeg;/* colbeg[j] points to the beginning of column j in nzval[] 
+                     and rowind[]  */
+    int_t *colend;/* colend[j] points to one past the last element of column
+		     j in nzval[] and rowind[]  */
+		  /* Note:
+		     Zero-based indexing is used;
+		     The consecutive columns of the nonzeros may not be 
+		     contiguous in storage, because the matrix has been 
+		     postmultiplied by a column permutation matrix. */
+} NCPformat;
+
+/* Stype == DN */
+typedef struct {
+    int_t lda;    /* leading dimension */
+    void *nzval;  /* array of size lda*ncol to represent a dense matrix */
+} DNformat;
+
+
+
+/*********************************************************
+ * Macros used for easy access of sparse matrix entries. *
+ *********************************************************/
+#define L_SUB_START(col)     ( Lstore->rowind_colptr[col] )
+#define L_SUB(ptr)           ( Lstore->rowind[ptr] )
+#define L_NZ_START(col)      ( Lstore->nzval_colptr[col] )
+#define L_FST_SUPC(superno)  ( Lstore->sup_to_col[superno] )
+#define U_NZ_START(col)      ( Ustore->colptr[col] )
+#define U_SUB(ptr)           ( Ustore->rowind[ptr] )
+
+
+#endif  /* __SUPERLU_SUPERMATRIX */
diff --git a/intern/opennl/superlu/sutil.c b/intern/opennl/superlu/sutil.c
new file mode 100644
index 00000000000..4689f34968a
--- /dev/null
+++ b/intern/opennl/superlu/sutil.c
@@ -0,0 +1,478 @@
+
+/*
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ */
+/*
+  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ 
+  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ 
+  Permission is hereby granted to use or copy this program for any
+  purpose, provided the above notices are retained on all copies.
+  Permission to modify the code and to distribute modified code is
+  granted, provided the above notices are retained, and a notice that
+  the code was modified is included with the above copyright notice.
+*/
+
+#include <math.h>
+#include "ssp_defs.h"
+
+void
+sCreate_CompCol_Matrix(SuperMatrix *A, int m, int n, int nnz, 
+		       float *nzval, int *rowind, int *colptr,
+		       Stype_t stype, Dtype_t dtype, Mtype_t mtype)
+{
+    NCformat *Astore;
+
+    A->Stype = stype;
+    A->Dtype = dtype;
+    A->Mtype = mtype;
+    A->nrow = m;
+    A->ncol = n;
+    A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
+    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
+    Astore = A->Store;
+    Astore->nnz = nnz;
+    Astore->nzval = nzval;
+    Astore->rowind = rowind;
+    Astore->colptr = colptr;
+}
+
+void
+sCreate_CompRow_Matrix(SuperMatrix *A, int m, int n, int nnz, 
+		       float *nzval, int *colind, int *rowptr,
+		       Stype_t stype, Dtype_t dtype, Mtype_t mtype)
+{
+    NRformat *Astore;
+
+    A->Stype = stype;
+    A->Dtype = dtype;
+    A->Mtype = mtype;
+    A->nrow = m;
+    A->ncol = n;
+    A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat) );
+    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
+    Astore = A->Store;
+    Astore->nnz = nnz;
+    Astore->nzval = nzval;
+    Astore->colind = colind;
+    Astore->rowptr = rowptr;
+}
+
+/* Copy matrix A into matrix B. */
+void
+sCopy_CompCol_Matrix(SuperMatrix *A, SuperMatrix *B)
+{
+    NCformat *Astore, *Bstore;
+    int      ncol, nnz, i;
+
+    B->Stype = A->Stype;
+    B->Dtype = A->Dtype;
+    B->Mtype = A->Mtype;
+    B->nrow  = A->nrow;;
+    B->ncol  = ncol = A->ncol;
+    Astore   = (NCformat *) A->Store;
+    Bstore   = (NCformat *) B->Store;
+    Bstore->nnz = nnz = Astore->nnz;
+    for (i = 0; i < nnz; ++i)
+	((float *)Bstore->nzval)[i] = ((float *)Astore->nzval)[i];
+    for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i];
+    for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i];
+}
+
+
+void
+sCreate_Dense_Matrix(SuperMatrix *X, int m, int n, float *x, int ldx,
+		    Stype_t stype, Dtype_t dtype, Mtype_t mtype)
+{
+    DNformat    *Xstore;
+    
+    X->Stype = stype;
+    X->Dtype = dtype;
+    X->Mtype = mtype;
+    X->nrow = m;
+    X->ncol = n;
+    X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) );
+    if ( !(X->Store) ) ABORT("SUPERLU_MALLOC fails for X->Store");
+    Xstore = (DNformat *) X->Store;
+    Xstore->lda = ldx;
+    Xstore->nzval = (float *) x;
+}
+
+void
+sCopy_Dense_Matrix(int M, int N, float *X, int ldx,
+			float *Y, int ldy)
+{
+/*
+ *
+ *  Purpose
+ *  =======
+ *
+ *  Copies a two-dimensional matrix X to another matrix Y.
+ */
+    int    i, j;
+    
+    for (j = 0; j < N; ++j)
+        for (i = 0; i < M; ++i)
+            Y[i + j*ldy] = X[i + j*ldx];
+}
+
+void
+sCreate_SuperNode_Matrix(SuperMatrix *L, int m, int n, int nnz, 
+			float *nzval, int *nzval_colptr, int *rowind,
+			int *rowind_colptr, int *col_to_sup, int *sup_to_col,
+			Stype_t stype, Dtype_t dtype, Mtype_t mtype)
+{
+    SCformat *Lstore;
+
+    L->Stype = stype;
+    L->Dtype = dtype;
+    L->Mtype = mtype;
+    L->nrow = m;
+    L->ncol = n;
+    L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) );
+    if ( !(L->Store) ) ABORT("SUPERLU_MALLOC fails for L->Store");
+    Lstore = L->Store;
+    Lstore->nnz = nnz;
+    Lstore->nsuper = col_to_sup[n];
+    Lstore->nzval = nzval;
+    Lstore->nzval_colptr = nzval_colptr;
+    Lstore->rowind = rowind;
+    Lstore->rowind_colptr = rowind_colptr;
+    Lstore->col_to_sup = col_to_sup;
+    Lstore->sup_to_col = sup_to_col;
+
+}
+
+
+/*
+ * Convert a row compressed storage into a column compressed storage.
+ */
+void
+sCompRow_to_CompCol(int m, int n, int nnz, 
+		    float *a, int *colind, int *rowptr,
+		    float **at, int **rowind, int **colptr)
+{
+    register int i, j, col, relpos;
+    int *marker;
+
+    /* Allocate storage for another copy of the matrix. */
+    *at = (float *) floatMalloc(nnz);
+    *rowind = (int *) intMalloc(nnz);
+    *colptr = (int *) intMalloc(n+1);
+    marker = (int *) intCalloc(n);
+    
+    /* Get counts of each column of A, and set up column pointers */
+    for (i = 0; i < m; ++i)
+	for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]];
+    (*colptr)[0] = 0;
+    for (j = 0; j < n; ++j) {
+	(*colptr)[j+1] = (*colptr)[j] + marker[j];
+	marker[j] = (*colptr)[j];
+    }
+
+    /* Transfer the matrix into the compressed column storage. */
+    for (i = 0; i < m; ++i) {
+	for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
+	    col = colind[j];
+	    relpos = marker[col];
+	    (*rowind)[relpos] = i;
+	    (*at)[relpos] = a[j];
+	    ++marker[col];
+	}
+    }
+
+    SUPERLU_FREE(marker);
+}
+
+
+void
+sPrint_CompCol_Matrix(char *what, SuperMatrix *A)
+{
+    NCformat     *Astore;
+    register int i,n;
+    float       *dp;
+    
+    printf("\nCompCol matrix %s:\n", what);
+    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+    n = A->ncol;
+    Astore = (NCformat *) A->Store;
+    dp = (float *) Astore->nzval;
+    printf("nrow %d, ncol %d, nnz %d\n", A->nrow,A->ncol,Astore->nnz);
+    printf("nzval: ");
+    for (i = 0; i < Astore->colptr[n]; ++i) printf("%f  ", dp[i]);
+    printf("\nrowind: ");
+    for (i = 0; i < Astore->colptr[n]; ++i) printf("%d  ", Astore->rowind[i]);
+    printf("\ncolptr: ");
+    for (i = 0; i <= n; ++i) printf("%d  ", Astore->colptr[i]);
+    printf("\n");
+    fflush(stdout);
+}
+
+void
+sPrint_SuperNode_Matrix(char *what, SuperMatrix *A)
+{
+    SCformat     *Astore;
+    register int i, j, k, c, d, n, nsup;
+    float       *dp;
+    int *col_to_sup, *sup_to_col, *rowind, *rowind_colptr;
+    
+    printf("\nSuperNode matrix %s:\n", what);
+    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+    n = A->ncol;
+    Astore = (SCformat *) A->Store;
+    dp = (float *) Astore->nzval;
+    col_to_sup = Astore->col_to_sup;
+    sup_to_col = Astore->sup_to_col;
+    rowind_colptr = Astore->rowind_colptr;
+    rowind = Astore->rowind;
+    printf("nrow %d, ncol %d, nnz %d, nsuper %d\n", 
+	   A->nrow,A->ncol,Astore->nnz,Astore->nsuper);
+    printf("nzval:\n");
+    for (k = 0; k <= Astore->nsuper; ++k) {
+      c = sup_to_col[k];
+      nsup = sup_to_col[k+1] - c;
+      for (j = c; j < c + nsup; ++j) {
+	d = Astore->nzval_colptr[j];
+	for (i = rowind_colptr[c]; i < rowind_colptr[c+1]; ++i) {
+	  printf("%d\t%d\t%e\n", rowind[i], j, dp[d++]);
+	}
+      }
+    }
+#if 0
+    for (i = 0; i < Astore->nzval_colptr[n]; ++i) printf("%f  ", dp[i]);
+#endif
+    printf("\nnzval_colptr: ");
+    for (i = 0; i <= n; ++i) printf("%d  ", Astore->nzval_colptr[i]);
+    printf("\nrowind: ");
+    for (i = 0; i < Astore->rowind_colptr[n]; ++i) 
+        printf("%d  ", Astore->rowind[i]);
+    printf("\nrowind_colptr: ");
+    for (i = 0; i <= n; ++i) printf("%d  ", Astore->rowind_colptr[i]);
+    printf("\ncol_to_sup: ");
+    for (i = 0; i < n; ++i) printf("%d  ", col_to_sup[i]);
+    printf("\nsup_to_col: ");
+    for (i = 0; i <= Astore->nsuper+1; ++i) 
+        printf("%d  ", sup_to_col[i]);
+    printf("\n");
+    fflush(stdout);
+}
+
+void
+sPrint_Dense_Matrix(char *what, SuperMatrix *A)
+{
+    DNformat     *Astore;
+    register int i;
+    float       *dp;
+    
+    printf("\nDense matrix %s:\n", what);
+    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+    Astore = (DNformat *) A->Store;
+    dp = (float *) Astore->nzval;
+    printf("nrow %d, ncol %d, lda %d\n", A->nrow,A->ncol,Astore->lda);
+    printf("\nnzval: ");
+    for (i = 0; i < A->nrow; ++i) printf("%f  ", dp[i]);
+    printf("\n");
+    fflush(stdout);
+}
+
+/*
+ * Diagnostic print of column "jcol" in the U/L factor.
+ */
+void
+sprint_lu_col(char *msg, int jcol, int pivrow, int *xprune, GlobalLU_t *Glu)
+{
+    int     i, k, fsupc;
+    int     *xsup, *supno;
+    int     *xlsub, *lsub;
+    float  *lusup;
+    int     *xlusup;
+    float  *ucol;
+    int     *usub, *xusub;
+
+    xsup    = Glu->xsup;
+    supno   = Glu->supno;
+    lsub    = Glu->lsub;
+    xlsub   = Glu->xlsub;
+    lusup   = Glu->lusup;
+    xlusup  = Glu->xlusup;
+    ucol    = Glu->ucol;
+    usub    = Glu->usub;
+    xusub   = Glu->xusub;
+    
+    printf("%s", msg);
+    printf("col %d: pivrow %d, supno %d, xprune %d\n", 
+	   jcol, pivrow, supno[jcol], xprune[jcol]);
+    
+    printf("\tU-col:\n");
+    for (i = xusub[jcol]; i < xusub[jcol+1]; i++)
+	printf("\t%d%10.4f\n", usub[i], ucol[i]);
+    printf("\tL-col in rectangular snode:\n");
+    fsupc = xsup[supno[jcol]];	/* first col of the snode */
+    i = xlsub[fsupc];
+    k = xlusup[jcol];
+    while ( i < xlsub[fsupc+1] && k < xlusup[jcol+1] ) {
+	printf("\t%d\t%10.4f\n", lsub[i], lusup[k]);
+	i++; k++;
+    }
+    fflush(stdout);
+}
+
+
+/*
+ * Check whether tempv[] == 0. This should be true before and after 
+ * calling any numeric routines, i.e., "panel_bmod" and "column_bmod". 
+ */
+void scheck_tempv(int n, float *tempv)
+{
+    int i;
+	
+    for (i = 0; i < n; i++) {
+	if (tempv[i] != 0.0) 
+	{
+	    fprintf(stderr,"tempv[%d] = %f\n", i,tempv[i]);
+	    ABORT("scheck_tempv");
+	}
+    }
+}
+
+
+void
+sGenXtrue(int n, int nrhs, float *x, int ldx)
+{
+    int  i, j;
+    for (j = 0; j < nrhs; ++j)
+	for (i = 0; i < n; ++i) {
+	    x[i + j*ldx] = 1.0;/* + (float)(i+1.)/n;*/
+	}
+}
+
+/*
+ * Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's
+ */
+void
+sFillRHS(trans_t trans, int nrhs, float *x, int ldx,
+         SuperMatrix *A, SuperMatrix *B)
+{
+    NCformat *Astore;
+    float   *Aval;
+    DNformat *Bstore;
+    float   *rhs;
+    float one = 1.0;
+    float zero = 0.0;
+    int      ldc;
+    char transc[1];
+
+    Astore = A->Store;
+    Aval   = (float *) Astore->nzval;
+    Bstore = B->Store;
+    rhs    = Bstore->nzval;
+    ldc    = Bstore->lda;
+    
+    if ( trans == NOTRANS ) *(unsigned char *)transc = 'N';
+    else *(unsigned char *)transc = 'T';
+
+    sp_sgemm(transc, nrhs, one, A,
+	     x, ldx, zero, rhs, ldc);
+
+}
+
+/* 
+ * Fills a float precision array with a given value.
+ */
+void 
+sfill(float *a, int alen, float dval)
+{
+    register int i;
+    for (i = 0; i < alen; i++) a[i] = dval;
+}
+
+
+
+/* 
+ * Check the inf-norm of the error vector 
+ */
+void sinf_norm_error(int nrhs, SuperMatrix *X, float *xtrue)
+{
+    DNformat *Xstore;
+    float err, xnorm;
+    float *Xmat, *soln_work;
+    int i, j;
+
+    Xstore = X->Store;
+    Xmat = Xstore->nzval;
+
+    for (j = 0; j < nrhs; j++) {
+      soln_work = &Xmat[j*Xstore->lda];
+      err = xnorm = 0.0;
+      for (i = 0; i < X->nrow; i++) {
+	err = SUPERLU_MAX(err, fabs(soln_work[i] - xtrue[i]));
+	xnorm = SUPERLU_MAX(xnorm, fabs(soln_work[i]));
+      }
+      err = err / xnorm;
+      printf("||X - Xtrue||/||X|| = %e\n", err);
+    }
+}
+
+
+
+/* Print performance of the code. */
+void
+sPrintPerf(SuperMatrix *L, SuperMatrix *U, mem_usage_t *mem_usage,
+           float rpg, float rcond, float *ferr,
+           float *berr, char *equed, SuperLUStat_t *stat)
+{
+    SCformat *Lstore;
+    NCformat *Ustore;
+    double   *utime;
+    flops_t  *ops;
+    
+    utime = stat->utime;
+    ops   = stat->ops;
+    
+    if ( utime[FACT] != 0. )
+	printf("Factor flops = %e\tMflops = %8.2f\n", ops[FACT],
+	       ops[FACT]*1e-6/utime[FACT]);
+    printf("Identify relaxed snodes	= %8.2f\n", utime[RELAX]);
+    if ( utime[SOLVE] != 0. )
+	printf("Solve flops = %.0f, Mflops = %8.2f\n", ops[SOLVE],
+	       ops[SOLVE]*1e-6/utime[SOLVE]);
+    
+    Lstore = (SCformat *) L->Store;
+    Ustore = (NCformat *) U->Store;
+    printf("\tNo of nonzeros in factor L = %d\n", Lstore->nnz);
+    printf("\tNo of nonzeros in factor U = %d\n", Ustore->nnz);
+    printf("\tNo of nonzeros in L+U = %d\n", Lstore->nnz + Ustore->nnz);
+	
+    printf("L\\U MB %.3f\ttotal MB needed %.3f\texpansions %d\n",
+	   mem_usage->for_lu/1e6, mem_usage->total_needed/1e6,
+	   mem_usage->expansions);
+	
+    printf("\tFactor\tMflops\tSolve\tMflops\tEtree\tEquil\tRcond\tRefine\n");
+    printf("PERF:%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f%8.2f\n",
+	   utime[FACT], ops[FACT]*1e-6/utime[FACT],
+	   utime[SOLVE], ops[SOLVE]*1e-6/utime[SOLVE],
+	   utime[ETREE], utime[EQUIL], utime[RCOND], utime[REFINE]);
+    
+    printf("\tRpg\t\tRcond\t\tFerr\t\tBerr\t\tEquil?\n");
+    printf("NUM:\t%e\t%e\t%e\t%e\t%s\n",
+	   rpg, rcond, ferr[0], berr[0], equed);
+    
+}
+
+
+
+
+int print_float_vec(char *what, int n, float *vec)
+{
+    int i;
+    printf("%s: n %d\n", what, n);
+    for (i = 0; i < n; ++i) printf("%d\t%f\n", i, vec[i]);
+    return 0;
+}
+
diff --git a/intern/opennl/superlu/util.c b/intern/opennl/superlu/util.c
new file mode 100644
index 00000000000..3c49d714d1c
--- /dev/null
+++ b/intern/opennl/superlu/util.c
@@ -0,0 +1,391 @@
+/*
+ * -- SuperLU routine (version 3.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ */
+/*
+  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
+ 
+  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
+  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ 
+  Permission is hereby granted to use or copy this program for any
+  purpose, provided the above notices are retained on all copies.
+  Permission to modify the code and to distribute modified code is
+  granted, provided the above notices are retained, and a notice that
+  the code was modified is included with the above copyright notice.
+*/
+
+#include <math.h>
+#include "ssp_defs.h"
+#include "util.h"
+
+/* 
+ * Global statistics variale
+ */
+
+void superlu_abort_and_exit(char* msg)
+{
+    fprintf(stderr, msg);
+    exit (-1);
+}
+
+/*
+ * Set the default values for the options argument.
+ */
+void set_default_options(superlu_options_t *options)
+{
+    options->Fact = DOFACT;
+    options->Equil = YES;
+    options->ColPerm = COLAMD;
+    options->DiagPivotThresh = 1.0;
+    options->Trans = NOTRANS;
+    options->IterRefine = NOREFINE;
+    options->SymmetricMode = NO;
+    options->PivotGrowth = NO;
+    options->ConditionNumber = NO;
+    options->PrintStat = YES;
+}
+
+/* Deallocate the structure pointing to the actual storage of the matrix. */
+void
+Destroy_SuperMatrix_Store(SuperMatrix *A)
+{
+    SUPERLU_FREE ( A->Store );
+}
+
+void
+Destroy_CompCol_Matrix(SuperMatrix *A)
+{
+    SUPERLU_FREE( ((NCformat *)A->Store)->rowind );
+    SUPERLU_FREE( ((NCformat *)A->Store)->colptr );
+    SUPERLU_FREE( ((NCformat *)A->Store)->nzval );
+    SUPERLU_FREE( A->Store );
+}
+
+void
+Destroy_CompRow_Matrix(SuperMatrix *A)
+{
+    SUPERLU_FREE( ((NRformat *)A->Store)->colind );
+    SUPERLU_FREE( ((NRformat *)A->Store)->rowptr );
+    SUPERLU_FREE( ((NRformat *)A->Store)->nzval );
+    SUPERLU_FREE( A->Store );
+}
+
+void
+Destroy_SuperNode_Matrix(SuperMatrix *A)
+{
+    SUPERLU_FREE ( ((SCformat *)A->Store)->rowind );
+    SUPERLU_FREE ( ((SCformat *)A->Store)->rowind_colptr );
+    SUPERLU_FREE ( ((SCformat *)A->Store)->nzval );
+    SUPERLU_FREE ( ((SCformat *)A->Store)->nzval_colptr );
+    SUPERLU_FREE ( ((SCformat *)A->Store)->col_to_sup );
+    SUPERLU_FREE ( ((SCformat *)A->Store)->sup_to_col );
+    SUPERLU_FREE ( A->Store );
+}
+
+/* A is of type Stype==NCP */
+void
+Destroy_CompCol_Permuted(SuperMatrix *A)
+{
+    SUPERLU_FREE ( ((NCPformat *)A->Store)->colbeg );
+    SUPERLU_FREE ( ((NCPformat *)A->Store)->colend );
+    SUPERLU_FREE ( A->Store );
+}
+
+/* A is of type Stype==DN */
+void
+Destroy_Dense_Matrix(SuperMatrix *A)
+{
+    DNformat* Astore = A->Store;
+    SUPERLU_FREE (Astore->nzval);
+    SUPERLU_FREE ( A->Store );
+}
+
+/*
+ * Reset repfnz[] for the current column 
+ */
+void
+resetrep_col (const int nseg, const int *segrep, int *repfnz)
+{
+    int i, irep;
+    
+    for (i = 0; i < nseg; i++) {
+	irep = segrep[i];
+	repfnz[irep] = EMPTY;
+    }
+}
+
+
+/*
+ * Count the total number of nonzeros in factors L and U,  and in the 
+ * symmetrically reduced L. 
+ */
+void
+countnz(const int n, int *xprune, int *nnzL, int *nnzU, GlobalLU_t *Glu)
+{
+    int          nsuper, fsupc, i, j;
+    int          nnzL0, jlen, irep;
+    int          *xsup, *xlsub;
+
+    xsup   = Glu->xsup;
+    xlsub  = Glu->xlsub;
+    *nnzL  = 0;
+    *nnzU  = (Glu->xusub)[n];
+    nnzL0  = 0;
+    nsuper = (Glu->supno)[n];
+
+    if ( n <= 0 ) return;
+
+    /* 
+     * For each supernode
+     */
+    for (i = 0; i <= nsuper; i++) {
+	fsupc = xsup[i];
+	jlen = xlsub[fsupc+1] - xlsub[fsupc];
+
+	for (j = fsupc; j < xsup[i+1]; j++) {
+	    *nnzL += jlen;
+	    *nnzU += j - fsupc + 1;
+	    jlen--;
+	}
+	irep = xsup[i+1] - 1;
+	nnzL0 += xprune[irep] - xlsub[irep];
+    }
+    
+    /* printf("\tNo of nonzeros in symm-reduced L = %d\n", nnzL0);*/
+}
+
+
+
+/*
+ * Fix up the data storage lsub for L-subscripts. It removes the subscript
+ * sets for structural pruning,	and applies permuation to the remaining
+ * subscripts.
+ */
+void
+fixupL(const int n, const int *perm_r, GlobalLU_t *Glu)
+{
+    register int nsuper, fsupc, nextl, i, j, k, jstrt;
+    int          *xsup, *lsub, *xlsub;
+
+    if ( n <= 1 ) return;
+
+    xsup   = Glu->xsup;
+    lsub   = Glu->lsub;
+    xlsub  = Glu->xlsub;
+    nextl  = 0;
+    nsuper = (Glu->supno)[n];
+    
+    /* 
+     * For each supernode ...
+     */
+    for (i = 0; i <= nsuper; i++) {
+	fsupc = xsup[i];
+	jstrt = xlsub[fsupc];
+	xlsub[fsupc] = nextl;
+	for (j = jstrt; j < xlsub[fsupc+1]; j++) {
+	    lsub[nextl] = perm_r[lsub[j]]; /* Now indexed into P*A */
+	    nextl++;
+  	}
+	for (k = fsupc+1; k < xsup[i+1]; k++) 
+	    	xlsub[k] = nextl;	/* Other columns in supernode i */
+
+    }
+
+    xlsub[n] = nextl;
+}
+
+
+/*
+ * Diagnostic print of segment info after panel_dfs().
+ */
+void print_panel_seg(int n, int w, int jcol, int nseg, 
+		     int *segrep, int *repfnz)
+{
+    int j, k;
+    
+    for (j = jcol; j < jcol+w; j++) {
+	printf("\tcol %d:\n", j);
+	for (k = 0; k < nseg; k++)
+	    printf("\t\tseg %d, segrep %d, repfnz %d\n", k, 
+			segrep[k], repfnz[(j-jcol)*n + segrep[k]]);
+    }
+
+}
+
+
+void
+StatInit(SuperLUStat_t *stat)
+{
+    register int i, w, panel_size, relax;
+
+    panel_size = sp_ienv(1);
+    relax = sp_ienv(2);
+    w = SUPERLU_MAX(panel_size, relax);
+    stat->panel_histo = intCalloc(w+1);
+    stat->utime = (double *) SUPERLU_MALLOC(NPHASES * sizeof(double));
+    if (!stat->utime) ABORT("SUPERLU_MALLOC fails for stat->utime");
+    stat->ops = (flops_t *) SUPERLU_MALLOC(NPHASES * sizeof(flops_t));
+    if (!stat->ops) ABORT("SUPERLU_MALLOC fails for stat->ops");
+    for (i = 0; i < NPHASES; ++i) {
+        stat->utime[i] = 0.;
+        stat->ops[i] = 0.;
+    }
+}
+
+
+void
+StatPrint(SuperLUStat_t *stat)
+{
+    double         *utime;
+    flops_t        *ops;
+
+    utime = stat->utime;
+    ops   = stat->ops;
+    printf("Factor time  = %8.2f\n", utime[FACT]);
+    if ( utime[FACT] != 0.0 )
+      printf("Factor flops = %e\tMflops = %8.2f\n", ops[FACT],
+	     ops[FACT]*1e-6/utime[FACT]);
+
+    printf("Solve time   = %8.2f\n", utime[SOLVE]);
+    if ( utime[SOLVE] != 0.0 )
+      printf("Solve flops = %e\tMflops = %8.2f\n", ops[SOLVE],
+	     ops[SOLVE]*1e-6/utime[SOLVE]);
+
+}
+
+
+void
+StatFree(SuperLUStat_t *stat)
+{
+    SUPERLU_FREE(stat->panel_histo);
+    SUPERLU_FREE(stat->utime);
+    SUPERLU_FREE(stat->ops);
+}
+
+
+flops_t
+LUFactFlops(SuperLUStat_t *stat)
+{
+    return (stat->ops[FACT]);
+}
+
+flops_t
+LUSolveFlops(SuperLUStat_t *stat)
+{
+    return (stat->ops[SOLVE]);
+}
+
+
+
+
+
+/* 
+ * Fills an integer array with a given value.
+ */
+void ifill(int *a, int alen, int ival)
+{
+    register int i;
+    for (i = 0; i < alen; i++) a[i] = ival;
+}
+
+
+
+/* 
+ * Get the statistics of the supernodes 
+ */
+#define NBUCKS 10
+static 	int	max_sup_size;
+
+void super_stats(int nsuper, int *xsup)
+{
+    register int nsup1 = 0;
+    int          i, isize, whichb, bl, bh;
+    int          bucket[NBUCKS];
+
+    max_sup_size = 0;
+
+    for (i = 0; i <= nsuper; i++) {
+	isize = xsup[i+1] - xsup[i];
+	if ( isize == 1 ) nsup1++;
+	if ( max_sup_size < isize ) max_sup_size = isize;	
+    }
+
+    printf("    Supernode statistics:\n\tno of super = %d\n", nsuper+1);
+    printf("\tmax supernode size = %d\n", max_sup_size);
+    printf("\tno of size 1 supernodes = %d\n", nsup1);
+
+    /* Histogram of the supernode sizes */
+    ifill (bucket, NBUCKS, 0);
+
+    for (i = 0; i <= nsuper; i++) {
+        isize = xsup[i+1] - xsup[i];
+        whichb = (float) isize / max_sup_size * NBUCKS;
+        if (whichb >= NBUCKS) whichb = NBUCKS - 1;
+        bucket[whichb]++;
+    }
+    
+    printf("\tHistogram of supernode sizes:\n");
+    for (i = 0; i < NBUCKS; i++) {
+        bl = (float) i * max_sup_size / NBUCKS;
+        bh = (float) (i+1) * max_sup_size / NBUCKS;
+        printf("\tsnode: %d-%d\t\t%d\n", bl+1, bh, bucket[i]);
+    }
+
+}
+
+
+float SpaSize(int n, int np, float sum_npw)
+{
+    return (sum_npw*8 + np*8 + n*4)/1024.;
+}
+
+float DenseSize(int n, float sum_nw)
+{
+    return (sum_nw*8 + n*8)/1024.;;
+}
+
+
+
+/*
+ * Check whether repfnz[] == EMPTY after reset.
+ */
+void check_repfnz(int n, int w, int jcol, int *repfnz)
+{
+    int jj, k;
+
+    for (jj = jcol; jj < jcol+w; jj++) 
+	for (k = 0; k < n; k++)
+	    if ( repfnz[(jj-jcol)*n + k] != EMPTY ) {
+		fprintf(stderr, "col %d, repfnz_col[%d] = %d\n", jj,
+			k, repfnz[(jj-jcol)*n + k]);
+		ABORT("check_repfnz");
+	    }
+}
+
+
+/* Print a summary of the testing results. */
+void
+PrintSumm(char *type, int nfail, int nrun, int nerrs)
+{
+    if ( nfail > 0 )
+	printf("%3s driver: %d out of %d tests failed to pass the threshold\n",
+	       type, nfail, nrun);
+    else
+	printf("All tests for %3s driver passed the threshold (%6d tests run)\n", type, nrun);
+
+    if ( nerrs > 0 )
+	printf("%6d error messages recorded\n", nerrs);
+}
+
+
+int print_int_vec(char *what, int n, int *vec)
+{
+    int i;
+    printf("%s\n", what);
+    for (i = 0; i < n; ++i) printf("%d\t%d\n", i, vec[i]);
+    return 0;
+}
diff --git a/intern/opennl/superlu/util.h b/intern/opennl/superlu/util.h
new file mode 100644
index 00000000000..1a3526d4e7e
--- /dev/null
+++ b/intern/opennl/superlu/util.h
@@ -0,0 +1,267 @@
+#ifndef __SUPERLU_UTIL /* allow multiple inclusions */
+#define __SUPERLU_UTIL
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+/*
+#ifndef __STDC__
+#include <malloc.h>
+#endif
+*/
+#include <assert.h>
+
+/***********************************************************************
+ * Macros
+ ***********************************************************************/
+#define FIRSTCOL_OF_SNODE(i)	(xsup[i])
+/* No of marker arrays used in the symbolic factorization,
+   each of size n */
+#define NO_MARKER     3
+#define NUM_TEMPV(m,w,t,b)  ( SUPERLU_MAX(m, (t + b)*w) )
+
+#ifndef USER_ABORT
+#define USER_ABORT(msg) superlu_abort_and_exit(msg)
+#endif
+
+#define ABORT(err_msg) \
+ { char msg[256];\
+   sprintf(msg,"%s at line %d in file %s\n",err_msg,__LINE__, __FILE__);\
+   USER_ABORT(msg); }
+
+
+#ifndef USER_MALLOC
+#if 1
+#define USER_MALLOC(size) superlu_malloc(size)
+#else
+/* The following may check out some uninitialized data */
+#define USER_MALLOC(size) memset (superlu_malloc(size), '\x0F', size)
+#endif
+#endif
+
+#define SUPERLU_MALLOC(size) USER_MALLOC(size)
+
+#ifndef USER_FREE
+#define USER_FREE(addr) superlu_free(addr)
+#endif
+
+#define SUPERLU_FREE(addr) USER_FREE(addr)
+
+#define CHECK_MALLOC(where) {                 \
+    extern int superlu_malloc_total;        \
+    printf("%s: malloc_total %d Bytes\n",     \
+	   where, superlu_malloc_total); \
+}
+
+#define SUPERLU_MAX(x, y) 	( (x) > (y) ? (x) : (y) )
+#define SUPERLU_MIN(x, y) 	( (x) < (y) ? (x) : (y) )
+
+/***********************************************************************
+ * Constants 
+ ***********************************************************************/
+#define EMPTY	(-1)
+/*#define NO	(-1)*/
+#define FALSE	0
+#define TRUE	1
+
+/***********************************************************************
+ * Enumerate types
+ ***********************************************************************/
+typedef enum {NO, YES}                                          yes_no_t;
+typedef enum {DOFACT, SamePattern, SamePattern_SameRowPerm, FACTORED} fact_t;
+typedef enum {NOROWPERM, LargeDiag, MY_PERMR}                   rowperm_t;
+typedef enum {NATURAL, MMD_ATA, MMD_AT_PLUS_A, COLAMD, MY_PERMC}colperm_t;
+typedef enum {NOTRANS, TRANS, CONJ}                             trans_t;
+typedef enum {NOEQUIL, ROW, COL, BOTH}                          DiagScale_t;
+typedef enum {NOREFINE, SINGLE=1, SLU_DOUBLE, EXTRA}                IterRefine_t;
+typedef enum {LUSUP, UCOL, LSUB, USUB}                          MemType;
+typedef enum {HEAD, TAIL}                                       stack_end_t;
+typedef enum {SYSTEM, USER}                                     LU_space_t;
+
+/* 
+ * The following enumerate type is used by the statistics variable 
+ * to keep track of flop count and time spent at various stages.
+ *
+ * Note that not all of the fields are disjoint.
+ */
+typedef enum {
+    COLPERM, /* find a column ordering that minimizes fills */
+    RELAX,   /* find artificial supernodes */
+    ETREE,   /* compute column etree */
+    EQUIL,   /* equilibrate the original matrix */
+    FACT,    /* perform LU factorization */
+    RCOND,   /* estimate reciprocal condition number */
+    SOLVE,   /* forward and back solves */
+    REFINE,  /* perform iterative refinement */
+    SLU_FLOAT,   /* time spent in floating-point operations */
+    TRSV,    /* fraction of FACT spent in xTRSV */
+    GEMV,    /* fraction of FACT spent in xGEMV */
+    FERR,    /* estimate error bounds after iterative refinement */
+    NPHASES  /* total number of phases */
+} PhaseType;
+
+
+/***********************************************************************
+ * Type definitions
+ ***********************************************************************/
+typedef float    flops_t;
+typedef unsigned char Logical;
+
+/* 
+ *-- This contains the options used to control the solve process.
+ *
+ * Fact   (fact_t)
+ *        Specifies whether or not the factored form of the matrix
+ *        A is supplied on entry, and if not, how the matrix A should
+ *        be factorizaed.
+ *        = DOFACT: The matrix A will be factorized from scratch, and the
+ *             factors will be stored in L and U.
+ *        = SamePattern: The matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector 
+ *             ScalePermstruct->perm_c and the column elimination tree
+ *             LUstruct->etree.
+ *        = SamePattern_SameRowPerm: The matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity	pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both row and column scaling factors R and C, and the
+ *             both row and column permutation vectors perm_r and perm_c,
+ *             distributed data structure set up from the previous symbolic
+ *             factorization.
+ *        = FACTORED: On entry, L, U, perm_r and perm_c contain the 
+ *              factored form of A. If DiagScale is not NOEQUIL, the matrix
+ *              A has been equilibrated with scaling factors R and C.
+ *
+ * Equil  (yes_no_t)
+ *        Specifies whether to equilibrate the system (scale A's row and
+ *        columns to have unit norm).
+ *
+ * ColPerm (colperm_t)
+ *        Specifies what type of column permutation to use to reduce fill.
+ *        = NATURAL: use the natural ordering 
+ *        = MMD_ATA: use minimum degree ordering on structure of A'*A
+ *        = MMD_AT_PLUS_A: use minimum degree ordering on structure of A'+A
+ *        = COLAMD: use approximate minimum degree column ordering
+ *        = MY_PERMC: use the ordering specified in ScalePermstruct->perm_c[]
+ *         
+ * Trans  (trans_t)
+ *        Specifies the form of the system of equations:
+ *        = NOTRANS: A * X = B        (No transpose)
+ *        = TRANS:   A**T * X = B     (Transpose)
+ *        = CONJ:    A**H * X = B     (Transpose)
+ *
+ * IterRefine (IterRefine_t)
+ *        Specifies whether to perform iterative refinement.
+ *        = NO: no iterative refinement
+ *        = WorkingPrec: perform iterative refinement in working precision
+ *        = ExtraPrec: perform iterative refinement in extra precision
+ *
+ * PrintStat (yes_no_t)
+ *        Specifies whether to print the solver's statistics.
+ *
+ * DiagPivotThresh (double, in [0.0, 1.0]) (only for sequential SuperLU)
+ *        Specifies the threshold used for a diagonal entry to be an
+ *        acceptable pivot.
+ *
+ * PivotGrowth (yes_no_t)
+ *        Specifies whether to compute the reciprocal pivot growth.
+ *
+ * ConditionNumber (ues_no_t)
+ *        Specifies whether to compute the reciprocal condition number.
+ *
+ * RowPerm (rowperm_t) (only for SuperLU_DIST)
+ *        Specifies whether to permute rows of the original matrix.
+ *        = NO: not to permute the rows
+ *        = LargeDiag: make the diagonal large relative to the off-diagonal
+ *        = MY_PERMR: use the permutation given in ScalePermstruct->perm_r[]
+ *           
+ * ReplaceTinyPivot (yes_no_t) (only for SuperLU_DIST)
+ *        Specifies whether to replace the tiny diagonals by
+ *        sqrt(epsilon)*||A|| during LU factorization.
+ *
+ * SolveInitialized (yes_no_t) (only for SuperLU_DIST)
+ *        Specifies whether the initialization has been performed to the
+ *        triangular solve.
+ *
+ * RefineInitialized (yes_no_t) (only for SuperLU_DIST)
+ *        Specifies whether the initialization has been performed to the
+ *        sparse matrix-vector multiplication routine needed in iterative
+ *        refinement.
+ */
+typedef struct {
+    fact_t        Fact;
+    yes_no_t      Equil;
+    colperm_t     ColPerm;
+    trans_t       Trans;
+    IterRefine_t  IterRefine;
+    yes_no_t      PrintStat;
+    yes_no_t      SymmetricMode;
+    double        DiagPivotThresh;
+    yes_no_t      PivotGrowth;
+    yes_no_t      ConditionNumber;
+    rowperm_t     RowPerm;
+    yes_no_t      ReplaceTinyPivot;
+    yes_no_t      SolveInitialized;
+    yes_no_t      RefineInitialized;
+} superlu_options_t;
+
+typedef struct {
+    int     *panel_histo; /* histogram of panel size distribution */
+    double  *utime;       /* running time at various phases */
+    flops_t *ops;         /* operation count at various phases */
+    int     TinyPivots;   /* number of tiny pivots */
+    int     RefineSteps;  /* number of iterative refinement steps */
+} SuperLUStat_t;
+
+
+/***********************************************************************
+ * Prototypes
+ ***********************************************************************/
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern void    Destroy_SuperMatrix_Store(SuperMatrix *);
+extern void    Destroy_CompCol_Matrix(SuperMatrix *);
+extern void    Destroy_CompRow_Matrix(SuperMatrix *);
+extern void    Destroy_SuperNode_Matrix(SuperMatrix *);
+extern void    Destroy_CompCol_Permuted(SuperMatrix *);
+extern void    Destroy_Dense_Matrix(SuperMatrix *);
+extern void    get_perm_c(int, SuperMatrix *, int *);
+extern void    set_default_options(superlu_options_t *options);
+extern void    sp_preorder (superlu_options_t *, SuperMatrix*, int*, int*,
+			    SuperMatrix*);
+extern void    superlu_abort_and_exit(char*);
+extern void    *superlu_malloc (size_t);
+extern int     *intMalloc (int);
+extern int     *intCalloc (int);
+extern void    superlu_free (void*);
+extern void    SetIWork (int, int, int, int *, int **, int **, int **,
+                         int **, int **, int **, int **);
+extern int     sp_coletree (int *, int *, int *, int, int, int *);
+extern void    relax_snode (const int, int *, const int, int *, int *);
+extern void    heap_relax_snode (const int, int *, const int, int *, int *);
+extern void    resetrep_col (const int, const int *, int *);
+extern int     spcoletree (int *, int *, int *, int, int, int *);
+extern int     *TreePostorder (int, int *);
+extern double  SuperLU_timer_ ();
+extern int     sp_ienv (int);
+extern int     lsame_ (char *, char *);
+extern int     xerbla_ (char *, int *);
+extern void    ifill (int *, int, int);
+extern void    snode_profile (int, int *);
+extern void    super_stats (int, int *);
+extern void    PrintSumm (char *, int, int, int);
+extern void    StatInit(SuperLUStat_t *);
+extern void    StatPrint (SuperLUStat_t *);
+extern void    StatFree(SuperLUStat_t *);
+extern void    print_panel_seg(int, int, int, int, int *, int *);
+extern void    check_repfnz(int, int, int, int *);
+
+#ifdef __cplusplus
+  }
+#endif
+
+#endif /* __SUPERLU_UTIL */
diff --git a/intern/opennl/superlu/xerbla.c b/intern/opennl/superlu/xerbla.c
new file mode 100644
index 00000000000..cb94fa71d95
--- /dev/null
+++ b/intern/opennl/superlu/xerbla.c
@@ -0,0 +1,43 @@
+
+#include <stdio.h>
+
+/* Subroutine */ int xerbla_(char *srname, int *info)
+{
+/*  -- LAPACK auxiliary routine (version 2.0) --   
+       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
+       Courant Institute, Argonne National Lab, and Rice University   
+       September 30, 1994   
+
+
+    Purpose   
+    =======   
+
+    XERBLA  is an error handler for the LAPACK routines.   
+    It is called by an LAPACK routine if an input parameter has an   
+    invalid value.  A message is printed and execution stops.   
+
+    Installers may consider modifying the STOP statement in order to   
+    call system-specific exception-handling facilities.   
+
+    Arguments   
+    =========   
+
+    SRNAME  (input) CHARACTER*6   
+            The name of the routine which called XERBLA.   
+
+    INFO    (input) INT   
+            The position of the invalid parameter in the parameter list   
+
+            of the calling routine.   
+
+   ===================================================================== 
+*/
+
+    printf("** On entry to %6s, parameter number %2d had an illegal value\n",
+		srname, *info);
+
+/*     End of XERBLA */
+
+    return 0;
+} /* xerbla_ */
+
diff --git a/source/Makefile b/source/Makefile
index a952fd35274..ccf4bb53043 100644
--- a/source/Makefile
+++ b/source/Makefile
@@ -87,6 +87,8 @@ PYPLAYERLIB ?= $(PYLIB)
     GRPLIB += $(OCGDIR)/blender/renderconverter/$(DEBUG_DIR)librenderconverter.a
     GRPLIB += $(OCGDIR)/blender/render/$(DEBUG_DIR)librender.a
     GRPLIB += $(OCGDIR)/blender/radiosity/$(DEBUG_DIR)libradiosity.a
+    GRPLIB += $(NAN_OPENNL)/lib/$(DEBUG_DIR)libopennl.a
+    GRPLIB += $(NAN_SUPERLU)/lib/$(DEBUG_DIR)libsuperlu.a
 
     GRPLIB += $(OCGDIR)/blender/python/$(DEBUG_DIR)libpython.a
 
diff --git a/source/nan_definitions.mk b/source/nan_definitions.mk
index 6156203d87b..697c9093d58 100644
--- a/source/nan_definitions.mk
+++ b/source/nan_definitions.mk
@@ -81,6 +81,8 @@ endif
     export NAN_GHOST ?= $(LCGDIR)/ghost
     export NAN_TEST_VERBOSITY ?= 1
     export NAN_BMFONT ?= $(LCGDIR)/bmfont
+    export NAN_OPENNL ?= $(LCGDIR)/opennl
+    export NAN_SUPERLU ?= $(LCGDIR)/superlu
     ifeq ($(FREE_WINDOWS), true)
       export NAN_FTGL ?= $(LCGDIR)/gcc/ftgl
     else