Files
mongo/lang/python/wiredtiger/intpacking.py
2016-01-01 16:37:39 -05:00

144 lines
5.2 KiB
Python

#!/usr/bin/env python
#
# Public Domain 2014-2016 MongoDB, Inc.
# Public Domain 2008-2014 WiredTiger, Inc.
#
# This is free and unencumbered software released into the public domain.
#
# Anyone is free to copy, modify, publish, use, compile, sell, or
# distribute this software, either in source code form or as a compiled
# binary, for any purpose, commercial or non-commercial, and by any
# means.
#
# In jurisdictions that recognize copyright laws, the author or authors
# of this software dedicate any and all copyright interest in the
# software to the public domain. We make this dedication for the benefit
# of the public at large and to the detriment of our heirs and
# successors. We intend this dedication to be an overt act of
# relinquishment in perpetuity of all present and future rights to this
# software under copyright law.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
import math, struct
# Variable-length integer packing
# need: up to 64 bits, both signed and unsigned
#
# Try hard for small values (up to ~2 bytes), after that, just encode the
# length in the first byte.
#
#   First       | Next  |                        |
#   byte        | bytes | Min Value              | Max Value
# --------------+-------+------------------------+--------------------------
#   [00 00xxxx] | free  | N/A                    | N/A
#   [00 01llll] | 8-l   | -2^64                  | -2^13 - 2^6 - 1
#   [00 1xxxxx] | 1     | -2^13 - 2^6            | -2^6 - 1
#   [01 xxxxxx] | 0     | -2^6                   | -1
#   [10 xxxxxx] | 0     | 0                      | 2^6 - 1
#   [11 0xxxxx] | 1     | 2^6                    | 2^13 + 2^6 - 1
#   [11 10llll] | l     | 2^13 + 2^6             | 2^64 - 1
#   [11 11xxxx] | free  | N/A                    | N/A
# First-byte markers.  The high bits of the first byte select the encoding
# class; the remaining low bits carry either payload or a length.
NEG_MULTI_MARKER = 1 << 4                   # 0x10: [00 01llll]
NEG_2BYTE_MARKER = 1 << 5                   # 0x20: [00 1xxxxx]
NEG_1BYTE_MARKER = 1 << 6                   # 0x40: [01 xxxxxx]
POS_1BYTE_MARKER = 1 << 7                   # 0x80: [10 xxxxxx]
POS_2BYTE_MARKER = 0x80 | 0x40              # 0xc0: [11 0xxxxx]
POS_MULTI_MARKER = 0x80 | 0x40 | 0x20       # 0xe0: [11 10llll]

# Value boundaries between the encoding classes.  Each two-byte range is
# offset by the adjacent one-byte range so the two do not overlap.
NEG_1BYTE_MIN = -(1 << 6)
NEG_2BYTE_MIN = NEG_1BYTE_MIN - (1 << 13)
POS_1BYTE_MAX = (1 << 6) - 1
POS_2BYTE_MAX = POS_1BYTE_MAX + (1 << 13)

# -2^64, i.e. every bit from bit 64 upwards set.
MINUS_BIT = -(1 << 64)
# Mask selecting the low 64 bits of an integer.
UINT64_MASK = (1 << 64) - 1
def getbits(x, start, end=0):
    '''Return bits [end, start) of x, shifted down so bit `end` is bit 0.

    x     -- integer to extract from; negative values are masked as
             two's complement, so the result is never negative.
    start -- number of least significant bits of x to keep.
    end   -- number of low bits to drop from the kept bits (default 0).
    '''
    low_mask = (1 << start) - 1
    return (x & low_mask) >> end
def get_int(b, size):
    '''Decode the first `size` bytes of b as a big-endian unsigned integer.

    b    -- byte sequence.  Elements may be one-character strings
            (Python 2 str) or integers (Python 3 bytes, bytearray).
    size -- number of leading bytes to consume; a size of zero or less
            yields 0.

    Raises IndexError if b holds fewer than `size` bytes.
    '''
    value = 0
    # Index explicitly (rather than slicing) so that a short buffer raises
    # IndexError instead of silently decoding a truncated value.
    for pos in range(size):
        item = b[pos]
        # Indexing yields an int for bytes/bytearray on Python 3 and a
        # length-1 string for str on Python 2; normalize to an int.
        octet = item if isinstance(item, int) else ord(item)
        value = (value << 8) | octet
    return value
def pack_int(x):
    '''Pack integer x into its variable-length byte encoding.

    Returns a byte string (str on Python 2, bytes on Python 3) of one to
    nine bytes.  The first byte is a marker selecting the encoding class;
    small values carry their payload in the marker's low bits, larger
    ones store a byte count there followed by the big-endian value.
    The sanity test in this file checks that packed values compare in
    the same order as the integers they encode.

    Raises struct.error if x - (POS_2BYTE_MAX + 1) does not fit in an
    unsigned 64-bit integer.  Values below -2**64 are reduced to their
    low 64 bits by the mask and are therefore not encoded faithfully.
    '''
    def as_bytes(*octets):
        # bytes(bytearray(...)) yields a byte string on Python 2 and 3.
        return bytes(bytearray(octets))

    if x < NEG_2BYTE_MIN:
        # Two's complement, big-endian, with redundant leading 0xff bytes
        # removed.  The marker stores how many bytes were removed.
        tail = struct.pack('>Q', x & UINT64_MASK).lstrip(b'\xff')
        return as_bytes(NEG_MULTI_MARKER | getbits(8 - len(tail), 4)) + tail

    if x < NEG_1BYTE_MIN:
        # 13 payload bits: the top 5 in the marker, the low 8 in one byte.
        x -= NEG_2BYTE_MIN
        return as_bytes(NEG_2BYTE_MARKER | getbits(x, 13, 8), getbits(x, 8))

    if x < 0:
        x -= NEG_1BYTE_MIN
        return as_bytes(NEG_1BYTE_MARKER | getbits(x, 6))

    if x <= POS_1BYTE_MAX:
        return as_bytes(POS_1BYTE_MARKER | getbits(x, 6))

    if x <= POS_2BYTE_MAX:
        x -= (POS_1BYTE_MAX + 1)
        return as_bytes(POS_2BYTE_MARKER | getbits(x, 13, 8), getbits(x, 8))

    if x == POS_2BYTE_MAX + 1:
        # This is a special case where we could store the value with
        # just a single byte, but we append a zero byte so that the
        # encoding doesn't get shorter for this one value.
        return as_bytes(POS_MULTI_MARKER | 0x1, 0)

    # Offset by the first value of this class, then drop leading zero
    # bytes.  The marker stores how many bytes remain.
    tail = struct.pack('>Q', x - (POS_2BYTE_MAX + 1)).lstrip(b'\x00')
    return as_bytes(POS_MULTI_MARKER | getbits(len(tail), 4)) + tail
def unpack_int(b):
    '''Decode one packed integer from the front of b.

    b -- byte sequence starting with a packed integer; it may be a
         Python 2 str or a Python 3 bytes/bytearray object.

    Returns a tuple (value, rest), where rest is b with the bytes of the
    decoded integer removed from the front.

    Raises IndexError if b is empty or shorter than the encoding
    announced by its first byte.
    '''
    def byte_at(pos):
        # Indexing yields an int for bytes/bytearray on Python 3 and a
        # length-1 string for str on Python 2; normalize to an int.
        item = b[pos]
        return item if isinstance(item, int) else ord(item)

    marker = byte_at(0)

    if marker < NEG_2BYTE_MARKER:
        # The marker's low 4 bits hold the number of leading 0xff bytes
        # that were stripped; put the sign bits back above the payload.
        nbytes = 8 - getbits(marker, 4)
        sign_bits = -1 << (nbytes * 8)
        return (sign_bits | get_int(b[1:], nbytes), b[nbytes + 1:])

    if marker < NEG_1BYTE_MARKER:
        payload = (getbits(marker, 5) << 8) | byte_at(1)
        return (NEG_2BYTE_MIN + payload, b[2:])

    if marker < POS_1BYTE_MARKER:
        return (NEG_1BYTE_MIN + getbits(marker, 6), b[1:])

    if marker < POS_2BYTE_MARKER:
        return (getbits(marker, 6), b[1:])

    if marker < POS_MULTI_MARKER:
        payload = (getbits(marker, 5) << 8) | byte_at(1)
        return (POS_1BYTE_MAX + 1 + payload, b[2:])

    # The marker's low 4 bits hold the number of payload bytes.
    nbytes = getbits(marker, 4)
    return (POS_2BYTE_MAX + 1 + get_int(b[1:], nbytes), b[nbytes + 1:])
# Sanity testing
def _sanity_check():
    '''Run randomized round-trip and ordering checks, reporting to stdout.

    For each magnitude, 1000 random values are packed and unpacked and
    compared with the original, then 1000 random pairs are checked to
    confirm that the packed byte strings compare in the same order as
    the integers.  Each check loop stops at its first failure.
    '''
    import random
    import sys

    def three_way(a, b):
        # Same result as Python 2's cmp() for these operands: -1, 0 or 1.
        return (a > b) - (a < b)

    emit = sys.stdout.write

    for big in (100, 10000, 1 << 40, 1 << 64):
        for _ in range(1000):
            r = random.randint(-big, big)
            emit("\rChecking %d" % r)
            if unpack_int(pack_int(r))[0] != r:
                emit("\nFound a problem with %d" % r + "\n")
                break
        emit("\n")

        for _ in range(1000):
            r1 = random.randint(-big, big)
            r2 = random.randint(-big, big)
            emit("\rChecking %d, %d" % (r1, r2))
            if three_way(r1, r2) != three_way(pack_int(r1), pack_int(r2)):
                emit("\nFound a problem with %d, %d" % (r1, r2) + "\n")
                break
        emit("\n")


if __name__ == '__main__':
    _sanity_check()