From 6113b4cd262bb6068141c5eefe60dafaea59beef Mon Sep 17 00:00:00 2001
From: dwight <dwight@10gen.com>
Date: Thu, 24 Feb 2011 13:34:42 -0500
Subject: [PATCH] change to a faster checksum (old was md5).  new is less
 discriminating but fast. also, now includes the JSectHeader in the checksum.

---
 db/dur_journal.cpp           | 26 +++++++++++------------
 db/dur_journalformat.h       |  2 +-
 db/dur_preplogbuffer.cpp     | 14 ++++++------
 db/dur_recover.cpp           |  7 ++----
 dbtests/perftests.cpp        | 41 +++++++++++++++++++++++++++++++++++-
 dbtests/test.vcxproj         |  1 +
 dbtests/test.vcxproj.filters |  3 +++
 util/checksum.h              | 37 ++++++++++++++++++++++++++++++++
 8 files changed, 103 insertions(+), 28 deletions(-)
 create mode 100644 util/checksum.h

diff --git a/db/dur_journal.cpp b/db/dur_journal.cpp
index 96973c17310..9335868ca4a 100644
--- a/db/dur_journal.cpp
+++ b/db/dur_journal.cpp
@@ -33,7 +33,7 @@
 #include "../util/mongoutils/str.h"
 #include "dur_journalimpl.h"
 #include "../util/file.h"
-#include "../util/md5.hpp"
+#include "../util/checksum.h"
 
 using namespace mongoutils;
 
@@ -42,6 +42,7 @@ namespace mongo {
     class AlignedBuilder;
 
     namespace dur {
+        BOOST_STATIC_ASSERT( sizeof(Checksum) == 16 );
         BOOST_STATIC_ASSERT( sizeof(JHeader) == 8192 );
         BOOST_STATIC_ASSERT( sizeof(JSectHeader) == 20 );
         BOOST_STATIC_ASSERT( sizeof(JSectFooter) == 32 );
@@ -81,22 +82,19 @@ namespace mongo {
             reserved = 0;
             magic[0] = magic[1] = magic[2] = magic[3] = '\n';
 
-            // skip section header since size modified after hashing
-            (const char*&)begin += sizeof(JSectHeader);
-            len                 -= sizeof(JSectHeader);
-
-            md5(begin, len, hash);
+            Checksum c;
+            c.gen(begin, (unsigned) len);
+            memcpy(hash, c.bytes, sizeof(hash));
         }
 
         bool JSectFooter::checkHash(const void* begin, int len) const {
-            // skip section header since size modified after hashing
-            // todo: skipping the header must be fixed, as we won't catch corruption of it then...
-            (const char*&)begin += sizeof(JSectHeader);
-            len                 -= sizeof(JSectHeader);
-            md5digest current;
-            md5(begin, len, current);
-            DEV log() << "checkHash len:" << len << " hash:" << toHex(hash, 16) << " current:" << toHex(current, 16) << endl;
-            return (memcmp(hash, current, sizeof(hash)) == 0);
+            Checksum c; // recomputed over all len bytes at begin; the JSectHeader is no longer skipped
+            c.gen(begin, len); // gen() takes an unsigned length, so len is converted implicitly here
+            DEV log() << "checkHash len:" << len << " hash:" << toHex(hash, 16) << " current:" << toHex(c.bytes, 16) << endl;
+            if( memcmp(hash, c.bytes, sizeof(hash)) == 0 ) // stored footer hash equals the recomputed checksum
+                return true;
+            log() << "dur checkHash mismatch, got: " << toHex(c.bytes, 16) << " expected: " << toHex(hash,16) << endl; // not behind the DEV prefix
+            return false;
         }
 
         JHeader::JHeader(string fname) {
diff --git a/db/dur_journalformat.h b/db/dur_journalformat.h
index 4b60afa0854..72587ccd7b6 100644
--- a/db/dur_journalformat.h
+++ b/db/dur_journalformat.h
@@ -34,7 +34,7 @@ namespace mongo {
 
             // x4142 is asci--readable if you look at the file with head/less -- thus the starting values were near
             // that.  simply incrementing the version # is safe on a fwd basis.
-            enum { CurrentVersion = 0x4147 };
+            enum { CurrentVersion = 0x4148 };
             unsigned short _version;
 
             // these are just for diagnostic ease (make header more useful as plain text)
diff --git a/db/dur_preplogbuffer.cpp b/db/dur_preplogbuffer.cpp
index 1648e899cb8..c1f6903c088 100644
--- a/db/dur_preplogbuffer.cpp
+++ b/db/dur_preplogbuffer.cpp
@@ -161,19 +161,19 @@ namespace mongo {
                 prepBasicWrites(bb);
             }
 
+            // pad to alignment, and set the total section length in the JSectHeader
+            assert( 0xffffe000 == (~(Alignment-1)) );
+            unsigned lenWillBe = bb.len() + sizeof(JSectFooter);
+            unsigned L = (lenWillBe + Alignment-1) & (~(Alignment-1));
+            dassert( L >= lenWillBe );
+            *((unsigned*)bb.atOfs(0)) = L;
+
             {
                 JSectFooter f(bb.buf(), bb.len());
                 bb.appendStruct(f);
             }
 
             {
-                // pad to alignment, and set the total section length in the JSectHeader
-                assert( 0xffffe000 == (~(Alignment-1)) );
-                unsigned L = (bb.len() + Alignment-1) & (~(Alignment-1));
-                dassert( L >= (unsigned) bb.len() );
-
-                *((unsigned*)bb.atOfs(0)) = L;
-
                 unsigned padding = L - bb.len();
                 bb.skip(padding);
                 dassert( bb.len() % Alignment == 0 );
diff --git a/db/dur_recover.cpp b/db/dur_recover.cpp
index ae19d7ad871..a42c38b09a0 100644
--- a/db/dur_recover.cpp
+++ b/db/dur_recover.cpp
@@ -30,10 +30,10 @@
 #include "database.h"
 #include "db.h"
 #include "../util/unittest.h"
+#include "../util/checksum.h"
 #include "cmdline.h"
 #include "curop.h"
 #include "mongommf.h"
-#include "../util/md5.hpp"
 
 #include <sys/stat.h>
 #include <fcntl.h>
@@ -120,10 +120,7 @@ namespace mongo {
                             const JSectFooter& footer = *(const JSectFooter*)pos;
                             int len = pos - (char*)_sectHead;
                             if (!footer.checkHash(_sectHead, len)) {
-                                massert(13594, str::stream() << "Journal checksum doesn't match. recorded: "
-                                        << toHex(footer.hash, sizeof(footer.hash))
-                                        << " actual: " << md5simpledigest(_sectHead, len)
-                                        , false);
+                                massert(13594, "dur journal checksum doesn't match", false);
                             }
                         }
                         return false; // false return value denotes end of section
diff --git a/dbtests/perftests.cpp b/dbtests/perftests.cpp
index 2ac7d4671cf..7ede0371ca0 100644
--- a/dbtests/perftests.cpp
+++ b/dbtests/perftests.cpp
@@ -33,6 +33,7 @@
 #include "../util/timer.h"
 #include "dbtests.h"
 #include "../db/dur_stats.h"
+#include "../util/checksum.h"
 
 namespace PerfTests {
     typedef DBDirectClient DBClientType;
@@ -64,6 +65,43 @@ namespace PerfTests {
     };
     DBClientType ClientBase::_client;
 
+    /** sanity checks (and timing output) for mongo::Checksum::gen() over a ~100MB buffer. */
+    class Checksum {
+    public:
+        void run() {
+            {
+                // the checksum code assumes 'standard' rollover on addition overflows. let's check that:
+                unsigned long long x = 0xffffffffffffffffUL;
+                ASSERT( x+2 == 1 );
+            }
+
+            const unsigned sz = 1024 * 1024 * 100 + 3; // +3 so that the trailing-bytes path is exercised
+            char *p = (char *) calloc(sz, 1); // zero filled, so gen() never reads uninitialized memory
+            ASSERT( p != 0 );
+            mongo::Checksum last;
+            for( int i = 0; i < 4; i++ ) {
+                Timer t;
+                mongo::Checksum c;
+                c.gen(p, sz);
+                cout << "checksum " << t.millis() << "ms" << endl;
+                ASSERT( i == 0 || c == last ); // same input must give the same checksum on every pass
+                last = c;
+            }
+            mongo::Checksum c;
+            c.gen(p, sz-1);
+            ASSERT( c != last ); // same data, one byte shorter
+            p[0]++; // check same data, different order, doesn't give same checksum
+            p[1]--;
+            c.gen(p, sz);
+            ASSERT( c != last );
+            p[1]++; // check same data, different order, doesn't give same checksum (different longwords case)
+            p[8]--;
+            c.gen(p, sz);
+            ASSERT( c != last );
+            free(p);
+        }
+    };
+
     // todo: use a couple threads. not a very good test yet.
     class TaskQueueTest {
         static int tot;
@@ -110,7 +148,7 @@ namespace PerfTests {
         virtual void post() { }
         virtual string name() = 0;
         virtual unsigned long long expectation() = 0;
-        virtual int howLongMillis() { return 5000; }
+        virtual int howLongMillis() { return 5000; } // how long to run test
     public:
         void say(unsigned long long n, int ms, string s) {
             //cout << setw(36) << left << s << ' ' << right << setw(7) << n*1000/ms << "/sec   " << setw(4) << ms << "ms" << endl;
@@ -325,6 +363,7 @@ namespace PerfTests {
         }
 
         void setupTests() {
+            add< Checksum >();
             add< TaskQueueTest >();
             cout << "stats\t" 
                 << "test\trps\ttime\t"
diff --git a/dbtests/test.vcxproj b/dbtests/test.vcxproj
index 8ff5aabcd07..dc55dce0c89 100644
--- a/dbtests/test.vcxproj
+++ b/dbtests/test.vcxproj
@@ -242,6 +242,7 @@
     <ClInclude Include="..\db\scanandorder.h" />
     <ClInclude Include="..\db\security.h" />
     <ClInclude Include="..\util\builder.h" />
+    <ClInclude Include="..\util\checksum.h" />
     <ClInclude Include="..\util\concurrency\list.h" />
     <ClInclude Include="..\util\concurrency\task.h" />
     <ClInclude Include="..\util\concurrency\value.h" />
diff --git a/dbtests/test.vcxproj.filters b/dbtests/test.vcxproj.filters
index 6bbc15918d8..f3b41fb7031 100755
--- a/dbtests/test.vcxproj.filters
+++ b/dbtests/test.vcxproj.filters
@@ -247,6 +247,9 @@
     <ClInclude Include="..\util\mongoutils\hash.h">
       <Filter>util\h</Filter>
     </ClInclude>
+    <ClInclude Include="..\util\checksum.h">
+      <Filter>util</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <Library Include="..\..\js\js64r.lib">
diff --git a/util/checksum.h b/util/checksum.h
new file mode 100644
index 00000000000..009ab56fbeb
--- /dev/null
+++ b/util/checksum.h
@@ -0,0 +1,37 @@
+#pragma once
+#include "../pch.h"
+namespace mongo {
+    /** a simple, rather dumb, but very fast checksum.  see perftests.cpp for unit tests. */
+    struct Checksum {
+        union {
+            unsigned char bytes[16];
+            unsigned long long words[2];
+        };
+
+        /** checksum buf[0..len) into words[] (aliased by bytes[]).  with n = len/16:
+            words[0] = (sum of the first n 64-bit words) ^ len, words[1] = (sum of the next n) ^ tail, where
+            each word is XORed with its position before summing and tail packs the 0-15 trailing bytes.
+            if you change this you must bump dur::CurrentVersion */
+        void gen(const void *buf, unsigned len) {
+            wassert( ((size_t)buf) % 8 == 0 ); // performance warning
+            const unsigned n = len / 8 / 2;
+            const unsigned long long *p = (const unsigned long long *) buf;
+            unsigned long long a = 0, b = 0, c = 0;
+            for( unsigned i = 0; i < n; i++ )
+                a += (*p++ ^ i);
+            for( unsigned i = 0; i < n; i++ )
+                b += (*p++ ^ i);
+            // tail is len % 16 bytes, i.e. 0-15 (not 0-7).  read it as unsigned char so that bytes >= 0x80
+            // are not sign-extended over the earlier ones, and rotate instead of shift so that with more
+            // than 8 tail bytes the first ones wrap around rather than being dropped from the checksum.
+            const unsigned char *tail = (const unsigned char *) buf;
+            for( unsigned i = n * 2 * 8; i < len; i++ )
+                c = ((c << 8) | (c >> 56)) ^ tail[i];
+            words[0] = a ^ len;
+            words[1] = b ^ c;
+        }
+
+        bool operator==(const Checksum& rhs) const { return words[0]==rhs.words[0] && words[1]==rhs.words[1]; }
+        bool operator!=(const Checksum& rhs) const { return words[0]!=rhs.words[0] || words[1]!=rhs.words[1]; }
+    };
+}