Description | Here is a .diff that modifies ncx.c. Basically, I am trying to move 32-bit words, rather than bytes (as originally done). This is generally for the Exodus reader.
An optimized version of this code went from 37.7 seconds (4.3.1) to 23.9 seconds (master, with these changes applied) to load a dataset (that I may not share) containing 950,000 cells and 1,400,000 points. This file's mods were by far the biggest change.
Note that this is in the netcdf directories. Please ask Utkarsh before applying this patch.
Note to self, OUO tire, Linux.
Test by opening any Exodus file. |
Attached Files | ncx.c.diff [^] (4,179 bytes) 2015-07-06 20:53 [Show Content] [Hide Content]diff --git a/ThirdParty/netcdf/vtknetcdf/libsrc/ncx.c b/ThirdParty/netcdf/vtknetcdf/libsrc/ncx.c
index 5156fd1..063b6d7 100644
--- a/ThirdParty/netcdf/vtknetcdf/libsrc/ncx.c
+++ b/ThirdParty/netcdf/vtknetcdf/libsrc/ncx.c
@@ -145,12 +145,39 @@ swapn2b(void *dst, const void *src, size_t nn)
static void
swap4b(void *dst, const void *src)
{
+
+/*
char *op = dst;
const char *ip = src;
op[0] = ip[3];
op[1] = ip[2];
op[2] = ip[1];
op[3] = ip[0];
+
+*/
+//
+// Instead of loading and storing bytes, let's just load an
+// unsigned int, move the bits into the correct position, and then
+// store it out.
+// There is a trick that this library is pulling that is fooling
+// the optimizer: we are trying to read 4-byte floats and
+// return 8-byte doubles. This happens in the calling routines
+// above here. Anyway, we must cast the output to a four-byte
+// float to get the optimizer to work correctly. Go figure.
+//
+ unsigned int *op = dst;
+ const char *ip = src;
+ unsigned int tempIn;
+ unsigned int tempOut;
+
+ tempIn = *(unsigned int *)(ip+0);
+ tempOut =
+ ( tempIn << 24) |
+ ((tempIn & 0x0000ff00) << 8) |
+ ((tempIn & 0x00ff0000) >> 8) |
+ ( tempIn >> 24);
+
+ *(float *)op = *(float *)(&tempOut);
}
# endif /* !vax */
@@ -171,6 +198,48 @@ swapn4b(void *dst, const void *src, size_t nn)
* ip += 4;
* }
*/
+//
+// Instead of moving bytes, let's move words - and set the bits into
+// the correct position through shifts. Any leftover (nn mod 4) elements
+// are picked up, less efficiently, by the byte-wise loop that follows.
+//
+ while(nn > 3)
+ {
+ unsigned int tempIn;
+
+ tempIn = *(unsigned int *)(ip+0);
+ *(unsigned int *)(op+0) =
+ ((tempIn ) << 24) |
+ ((tempIn & 0x0000ff00) << 8) |
+ ((tempIn & 0x00ff0000) >> 8) |
+ ((tempIn ) >> 24);
+
+ tempIn = *(unsigned int *)(ip+4);
+ *(unsigned int *)(op+4) =
+ ((tempIn ) << 24) |
+ ((tempIn & 0x0000ff00) << 8) |
+ ((tempIn & 0x00ff0000) >> 8) |
+ ((tempIn ) >> 24);
+
+ tempIn = *(unsigned int *)(ip+8);
+ *(unsigned int *)(op+8) =
+ ((tempIn ) << 24) |
+ ((tempIn & 0x0000ff00) << 8) |
+ ((tempIn & 0x00ff0000) >> 8) |
+ ((tempIn ) >> 24);
+
+ tempIn = *(unsigned int *)(ip+12);
+ *(unsigned int *)(op+12) =
+ ((tempIn ) << 24) |
+ ((tempIn & 0x0000ff00) << 8) |
+ ((tempIn & 0x00ff0000) >> 8) |
+ ((tempIn ) >> 24);
+
+ op += 16;
+ ip += 16;
+ nn -= 4;
+ }
+/*
while(nn > 3)
{
op[0] = ip[3];
@@ -193,6 +262,7 @@ swapn4b(void *dst, const void *src, size_t nn)
ip += 16;
nn -= 4;
}
+*/
while(nn-- != 0)
{
op[0] = ip[3];
@@ -254,9 +324,47 @@ swapn8b(void *dst, const void *src, size_t nn)
* ip += 8;
* }
*/
+
# ifndef FLOAT_WORDS_BIGENDIAN
+
+//
+// Instead of moving bytes, let's move words - and set the bits into
+// the correct position through shifts. Any leftover (nn mod 2) element
+// is picked up, less efficiently, by the byte-wise loop that follows.
+//
+ while(nn > 1)
+ {
+ ulong tempIn;
+
+ tempIn = *(ulong*)(ip+0);
+ *(ulong*)(op+0) =
+ ((tempIn ) << 56) |
+ ((tempIn & 0x000000000000ff00) << 40) |
+ ((tempIn & 0x0000000000ff0000) << 24) |
+ ((tempIn & 0x00000000ff000000) << 8) |
+ ((tempIn & 0x000000ff00000000) >> 8) |
+ ((tempIn & 0x0000ff0000000000) >> 24) |
+ ((tempIn & 0x00ff000000000000) >> 40) |
+ ((tempIn ) >> 56);
+ tempIn = *(ulong*)(ip+8);
+ *(ulong*)(op+8) =
+ ((tempIn ) << 56) |
+ ((tempIn & 0x000000000000ff00) << 40) |
+ ((tempIn & 0x0000000000ff0000) << 24) |
+ ((tempIn & 0x00000000ff000000) << 8) |
+ ((tempIn & 0x000000ff00000000) >> 8) |
+ ((tempIn & 0x0000ff0000000000) >> 24) |
+ ((tempIn & 0x00ff000000000000) >> 40) |
+ ((tempIn ) >> 56);
+
+ op += 16;
+ ip += 16;
+ nn -= 2;
+ }
+/*
while(nn > 1)
{
+
op[0] = ip[7];
op[1] = ip[6];
op[2] = ip[5];
@@ -277,6 +385,7 @@ swapn8b(void *dst, const void *src, size_t nn)
ip += 16;
nn -= 2;
}
+*/
while(nn-- != 0)
{
op[0] = ip[7];
Unoptimized.png [^] (1,180,314 bytes) 2015-07-06 20:54
Optimized-1.png [^] (1,144,463 bytes) 2015-07-06 20:54
0001-Performance-tweaks-by-using-word-operations-instead-.patch [^] (7,793 bytes) 2015-07-23 14:55 [Show Content] [Hide Content]From a4b9cc1881b91e4cc8f19569a79662be60dc6509 Mon Sep 17 00:00:00 2001
From: W Alan Scott <wascott@sandia.gov>
Date: Thu, 23 Jul 2015 14:52:46 -0400
Subject: [PATCH] Performance tweaks by using word operations instead of byte.
---
ThirdParty/netcdf/vtknetcdf/libsrc/ncx.c | 197 +++++++++++++++++++------------
1 file changed, 121 insertions(+), 76 deletions(-)
diff --git a/ThirdParty/netcdf/vtknetcdf/libsrc/ncx.c b/ThirdParty/netcdf/vtknetcdf/libsrc/ncx.c
index 5156fd1..368decf 100644
--- a/ThirdParty/netcdf/vtknetcdf/libsrc/ncx.c
+++ b/ThirdParty/netcdf/vtknetcdf/libsrc/ncx.c
@@ -145,12 +145,22 @@ swapn2b(void *dst, const void *src, size_t nn)
static void
swap4b(void *dst, const void *src)
{
- char *op = dst;
- const char *ip = src;
- op[0] = ip[3];
- op[1] = ip[2];
- op[2] = ip[1];
- op[3] = ip[0];
+ // Original implementation
+ // char *op = dst;
+ // const char *ip = src;
+ // op[0] = ip[3];
+ // op[1] = ip[2];
+ // op[2] = ip[1];
+ // op[3] = ip[0];
+
+ // Instead of moving bytes, lets move words - and set the bits into
+ // the correct position through shifts.
+ uint32_t* op32 = (uint32_t*)(dst);
+ const uint32_t* ip32 = (const uint32_t*)(src);
+ *op32 = (((*ip32) & 0x000000ffu) << 24) |
+ (((*ip32) & 0x0000ff00u) << 8) |
+ (((*ip32) & 0x00ff0000u) >> 8) |
+ (((*ip32) & 0xff000000u) >> 24);
}
# endif /* !vax */
@@ -173,32 +183,45 @@ swapn4b(void *dst, const void *src, size_t nn)
*/
while(nn > 3)
{
- op[0] = ip[3];
- op[1] = ip[2];
- op[2] = ip[1];
- op[3] = ip[0];
- op[4] = ip[7];
- op[5] = ip[6];
- op[6] = ip[5];
- op[7] = ip[4];
- op[8] = ip[11];
- op[9] = ip[10];
- op[10] = ip[9];
- op[11] = ip[8];
- op[12] = ip[15];
- op[13] = ip[14];
- op[14] = ip[13];
- op[15] = ip[12];
+ uint32_t* op32 = (uint32_t*)(op);
+ const uint32_t* ip32 = (const uint32_t*)(ip);
+ *op32 = (((*ip32) & 0x000000ffu) << 24) |
+ (((*ip32) & 0x0000ff00u) << 8) |
+ (((*ip32) & 0x00ff0000u) >> 8) |
+ (((*ip32) & 0xff000000u) >> 24);
+
+ op32 = (uint32_t*)(op + 4);
+ ip32 = (const uint32_t*)(ip + 4);
+ *op32 = (((*ip32) & 0x000000ffu) << 24) |
+ (((*ip32) & 0x0000ff00u) << 8) |
+ (((*ip32) & 0x00ff0000u) >> 8) |
+ (((*ip32) & 0xff000000u) >> 24);
+
+ op32 = (uint32_t*)(op + 8);
+ ip32 = (const uint32_t*)(ip + 8);
+ *op32 = (((*ip32) & 0x000000ffu) << 24) |
+ (((*ip32) & 0x0000ff00u) << 8) |
+ (((*ip32) & 0x00ff0000u) >> 8) |
+ (((*ip32) & 0xff000000u) >> 24);
+
+ op32 = (uint32_t*)(op + 12);
+ ip32 = (const uint32_t*)(ip + 12);
+ *op32 = (((*ip32) & 0x000000ffu) << 24) |
+ (((*ip32) & 0x0000ff00u) << 8) |
+ (((*ip32) & 0x00ff0000u) >> 8) |
+ (((*ip32) & 0xff000000u) >> 24);
op += 16;
ip += 16;
nn -= 4;
}
while(nn-- != 0)
{
- op[0] = ip[3];
- op[1] = ip[2];
- op[2] = ip[1];
- op[3] = ip[0];
+ uint32_t* op32 = (uint32_t*)(op);
+ const uint32_t* ip32 = (const uint32_t*)(ip);
+ *op32 = (((*ip32) & 0x000000ffu) << 24) |
+ (((*ip32) & 0x0000ff00u) << 8) |
+ (((*ip32) & 0x00ff0000u) >> 8) |
+ (((*ip32) & 0xff000000u) >> 24);
op += 4;
ip += 4;
}
@@ -208,26 +231,34 @@ swapn4b(void *dst, const void *src, size_t nn)
static void
swap8b(void *dst, const void *src)
{
- char *op = dst;
- const char *ip = src;
# ifndef FLOAT_WORDS_BIGENDIAN
- op[0] = ip[7];
- op[1] = ip[6];
- op[2] = ip[5];
- op[3] = ip[4];
- op[4] = ip[3];
- op[5] = ip[2];
- op[6] = ip[1];
- op[7] = ip[0];
+ uint64_t* op64 = (uint64_t*)(dst);
+ const uint64_t* ip64 = (uint64_t*)(src);
+ *op64 = (((*ip64) ) << 56) |
+ (((*ip64) & 0x000000000000ff00) << 40) |
+ (((*ip64) & 0x0000000000ff0000) << 24) |
+ (((*ip64) & 0x00000000ff000000) << 8) |
+ (((*ip64) & 0x000000ff00000000) >> 8) |
+ (((*ip64) & 0x0000ff0000000000) >> 24) |
+ (((*ip64) & 0x00ff000000000000) >> 40) |
+ (((*ip64) ) >> 56);
# else
- op[0] = ip[3];
- op[1] = ip[2];
- op[2] = ip[1];
- op[3] = ip[0];
- op[4] = ip[7];
- op[5] = ip[6];
- op[6] = ip[5];
- op[7] = ip[4];
+ char *op = dst;
+ const char *ip = src;
+
+ uint32_t* op32 = (uint32_t*)(op);
+ const uint32_t* ip32 = (const uint32_t*)(ip);
+ *op32 = (((*ip32) & 0x000000ffu) << 24) |
+ (((*ip32) & 0x0000ff00u) << 8) |
+ (((*ip32) & 0x00ff0000u) >> 8) |
+ (((*ip32) & 0xff000000u) >> 24);
+
+ op32 = (uint32_t*)(op + 4);
+ ip32 = (const uint32_t*)(ip + 4);
+ *op32 = (((*ip32) & 0x000000ffu) << 24) |
+ (((*ip32) & 0x0000ff00u) << 8) |
+ (((*ip32) & 0x00ff0000u) >> 8) |
+ (((*ip32) & 0xff000000u) >> 24);
# endif
}
# endif /* !vax */
@@ -257,50 +288,64 @@ swapn8b(void *dst, const void *src, size_t nn)
# ifndef FLOAT_WORDS_BIGENDIAN
while(nn > 1)
{
- op[0] = ip[7];
- op[1] = ip[6];
- op[2] = ip[5];
- op[3] = ip[4];
- op[4] = ip[3];
- op[5] = ip[2];
- op[6] = ip[1];
- op[7] = ip[0];
- op[8] = ip[15];
- op[9] = ip[14];
- op[10] = ip[13];
- op[11] = ip[12];
- op[12] = ip[11];
- op[13] = ip[10];
- op[14] = ip[9];
- op[15] = ip[8];
+ uint64_t* op64 = (uint64_t*)(op);
+ const uint64_t* ip64 = (uint64_t*)(ip);
+ *op64 = (((*ip64) ) << 56) |
+ (((*ip64) & 0x000000000000ff00) << 40) |
+ (((*ip64) & 0x0000000000ff0000) << 24) |
+ (((*ip64) & 0x00000000ff000000) << 8) |
+ (((*ip64) & 0x000000ff00000000) >> 8) |
+ (((*ip64) & 0x0000ff0000000000) >> 24) |
+ (((*ip64) & 0x00ff000000000000) >> 40) |
+ (((*ip64) ) >> 56);
+
+ op64 = (uint64_t*)(op+8);
+ ip64 = (const uint64_t*)(ip+8);
+ *op64 = (((*ip64) ) << 56) |
+ (((*ip64) & 0x000000000000ff00) << 40) |
+ (((*ip64) & 0x0000000000ff0000) << 24) |
+ (((*ip64) & 0x00000000ff000000) << 8) |
+ (((*ip64) & 0x000000ff00000000) >> 8) |
+ (((*ip64) & 0x0000ff0000000000) >> 24) |
+ (((*ip64) & 0x00ff000000000000) >> 40) |
+ (((*ip64) ) >> 56);
op += 16;
ip += 16;
nn -= 2;
}
while(nn-- != 0)
{
- op[0] = ip[7];
- op[1] = ip[6];
- op[2] = ip[5];
- op[3] = ip[4];
- op[4] = ip[3];
- op[5] = ip[2];
- op[6] = ip[1];
- op[7] = ip[0];
+ uint64_t* op64 = (uint64_t*)(op);
+ const uint64_t* ip64 = (uint64_t*)(ip);
+ *op64 = (((*ip64) ) << 56) |
+ (((*ip64) & 0x000000000000ff00) << 40) |
+ (((*ip64) & 0x0000000000ff0000) << 24) |
+ (((*ip64) & 0x00000000ff000000) << 8) |
+ (((*ip64) & 0x000000ff00000000) >> 8) |
+ (((*ip64) & 0x0000ff0000000000) >> 24) |
+ (((*ip64) & 0x00ff000000000000) >> 40) |
+ (((*ip64) ) >> 56);
+
op += 8;
ip += 8;
}
# else
while(nn-- != 0)
{
- op[0] = ip[3];
- op[1] = ip[2];
- op[2] = ip[1];
- op[3] = ip[0];
- op[4] = ip[7];
- op[5] = ip[6];
- op[6] = ip[5];
- op[7] = ip[4];
+ uint32_t* op32 = (uint32_t*)(op);
+ const uint32_t* ip32 = (const uint32_t*)(ip);
+ *op32 = (((*ip32) & 0x000000ffu) << 24) |
+ (((*ip32) & 0x0000ff00u) << 8) |
+ (((*ip32) & 0x00ff0000u) >> 8) |
+ (((*ip32) & 0xff000000u) >> 24);
+
+ op32 = (uint32_t*)(op + 4);
+ ip32 = (const uint32_t*)(ip + 4);
+ *op32 = (((*ip32) & 0x000000ffu) << 24) |
+ (((*ip32) & 0x0000ff00u) << 8) |
+ (((*ip32) & 0x00ff0000u) >> 8) |
+ (((*ip32) & 0xff000000u) >> 24);
+
op += 8;
ip += 8;
}
--
1.9.1
|