| |
| |
| |
| |
| """Contains basic c-functions which usually contain performance critical code |
| Keeping this code separate from the beginning makes it easier to out-source |
| it into c later, if required""" |
|
|
| import zlib |
| from gitdb.util import byte_ord |
| decompressobj = zlib.decompressobj |
|
|
| import mmap |
| from itertools import islice |
| from functools import reduce |
|
|
| from gitdb.const import NULL_BYTE, BYTE_SPACE |
| from gitdb.utils.encoding import force_text |
| from gitdb.typ import ( |
| str_blob_type, |
| str_commit_type, |
| str_tree_type, |
| str_tag_type, |
| ) |
|
|
| from io import StringIO |
|
|
| |
# Pack stream type ids of the two delta object kinds; the non-delta ids are
# declared in gitdb.typ and referenced through the imported str_*_type names.
OFS_DELTA = 6
REF_DELTA = 7
delta_types = (OFS_DELTA, REF_DELTA)

# Maps a numeric pack object type id to its type name.
# NOTE(review): ids 1-5 map to bytes while the delta ids map to str -
# the mixed value types look intentional (delta "names" are never written as
# loose-object headers), but confirm against the callers before changing.
type_id_to_type_map = {
    0: b'',                    # unused id
    1: str_commit_type,
    2: str_tree_type,
    3: str_blob_type,
    4: str_tag_type,
    5: b'',                    # unused id
    OFS_DELTA: "OFS_DELTA",
    REF_DELTA: "REF_DELTA"
}

# Inverse of the map above - resolves a type name to its numeric pack id.
type_to_type_id_map = {
    str_commit_type: 1,
    str_tree_type: 2,
    str_blob_type: 3,
    str_tag_type: 4,
    "OFS_DELTA": OFS_DELTA,
    "REF_DELTA": REF_DELTA,
}

# Amount of bytes copied per iteration by the streaming routines below.
chunk_size = 1000 * mmap.PAGESIZE

# Public API of this module.
__all__ = ('is_loose_object', 'loose_object_header_info', 'msb_size', 'pack_object_header_info',
           'write_object', 'loose_object_header', 'stream_copy', 'apply_delta_data',
           'is_equal_canonical_sha', 'connect_deltas', 'DeltaChunkList', 'create_pack_object_header')
|
|
|
|
| |
|
|
| def _set_delta_rbound(d, size): |
| """Truncate the given delta to the given size |
| :param size: size relative to our target offset, may not be 0, must be smaller or equal |
| to our size |
| :return: d""" |
| d.ts = size |
|
|
| |
| |
| return d |
|
|
|
|
| def _move_delta_lbound(d, bytes): |
| """Move the delta by the given amount of bytes, reducing its size so that its |
| right bound stays static |
| :param bytes: amount of bytes to move, must be smaller than delta size |
| :return: d""" |
| if bytes == 0: |
| return |
|
|
| d.to += bytes |
| d.so += bytes |
| d.ts -= bytes |
| if d.data is not None: |
| d.data = d.data[bytes:] |
| |
|
|
| return d |
|
|
|
|
def delta_duplicate(src):
    """:return: a new DeltaChunk carrying the same field values as *src* (shallow copy)"""
    return DeltaChunk(src.to, src.ts, src.so, src.data)
|
|
|
|
def delta_chunk_apply(dc, bbuf, write):
    """Apply a single DeltaChunk to the target stream.

    :param dc: the DeltaChunk to apply
    :param bbuf: buffer providing source bytes for copy operations
    :param write: write method to call with data to write"""
    if dc.data is None:
        # copy-chunk: take ts bytes from the base buffer at our source offset
        write(bbuf[dc.so:dc.so + dc.ts])
    else:
        # add-chunk: emit our own data, truncated if it holds more than ts bytes
        if len(dc.data) > dc.ts:
            write(dc.data[:dc.ts])
        else:
            write(dc.data)
    # END handle chunk mode
| |
| |
|
|
|
|
class DeltaChunk:

    """Represents a piece of a delta, it can either add new data, or copy existing
    one from a source buffer"""

    __slots__ = (
        'to',    # absolute offset of the chunk in the target buffer
        'ts',    # size of the chunk in the target buffer
        'so',    # offset in the source buffer to copy from (copy-chunks)
        'data',  # bytes to add to the target, or None for a copy-chunk
    )

    def __init__(self, to, ts, so, data):
        self.to = to
        self.ts = ts
        self.so = so
        self.data = data

    def __repr__(self):
        return "DeltaChunk(%i, %i, %s, %s)" % (self.to, self.ts, self.so, self.data or "")

    def has_data(self):
        """:return: True if the instance has data to add to the target stream"""
        return self.data is not None

    def rbound(self):
        """:return: absolute offset of the first byte past this chunk in the target"""
        return self.to + self.ts
|
|
| |
|
|
|
|
| def _closest_index(dcl, absofs): |
| """:return: index at which the given absofs should be inserted. The index points |
| to the DeltaChunk with a target buffer absofs that equals or is greater than |
| absofs. |
| **Note:** global method for performance only, it belongs to DeltaChunkList""" |
| lo = 0 |
| hi = len(dcl) |
| while lo < hi: |
| mid = (lo + hi) / 2 |
| dc = dcl[mid] |
| if dc.to > absofs: |
| hi = mid |
| elif dc.rbound() > absofs or dc.to == absofs: |
| return mid |
| else: |
| lo = mid + 1 |
| |
| |
| return len(dcl) - 1 |
|
|
|
|
def delta_list_apply(dcl, bbuf, write):
    """Apply every chunk of the chain and write the final result using the passed
    write function.

    :param dcl: iterable of DeltaChunks to apply in order
    :param bbuf: base buffer containing the base of all deltas contained in this
        list. It will only be used if the chunk in question does not have a base
        chain.
    :param write: function taking a string of bytes to write to the output"""
    for chunk in dcl:
        delta_chunk_apply(chunk, bbuf, write)
    # END for each chunk
| |
|
|
|
|
def delta_list_slice(dcl, absofs, size, ndcl):
    """Append to ``ndcl`` the subsection of ``dcl`` covering the given absolute
    target offset, with the given size in bytes. Chunks are copied, and the first
    and last chunk are trimmed so that exactly ``size`` bytes are covered.

    :param dcl: source list of DeltaChunks, sorted by target offset
    :param absofs: absolute target-buffer offset at which the slice starts
    :param size: amount of bytes the slice must cover
    :param ndcl: output list receiving the covering chunk copies
    :return: None"""
    cdi = _closest_index(dcl, absofs)    # index of the chunk covering absofs
    cd = dcl[cdi]
    slen = len(dcl)
    lappend = ndcl.append                # hoisted bound method for the loop below

    # the first chunk may be only partially covered - copy it and trim its
    # left bound so it starts exactly at absofs
    if cd.to != absofs:
        tcd = DeltaChunk(cd.to, cd.ts, cd.so, cd.data)
        _move_delta_lbound(tcd, absofs - cd.to)
        tcd.ts = min(tcd.ts, size)
        lappend(tcd)
        size -= tcd.ts
        cdi += 1
    # END lbound overlap handling

    # copy whole chunks while they fit, truncating the final one if it
    # extends past the requested size
    while cdi < slen and size:
        # are we larger than the current block
        cd = dcl[cdi]
        if cd.ts <= size:
            lappend(DeltaChunk(cd.to, cd.ts, cd.so, cd.data))
            size -= cd.ts
        else:
            tcd = DeltaChunk(cd.to, cd.ts, cd.so, cd.data)
            tcd.ts = size
            lappend(tcd)
            size -= tcd.ts
            break
        # END handle size
        cdi += 1
    # END for each chunk
| |
|
|
|
|
class DeltaChunkList(list):

    """List with special functionality to deal with DeltaChunks.
    There are two types of lists we represent. The one was created bottom-up, working
    towards the latest delta, the other kind was created top-down, working from the
    latest delta down to the earliest ancestor. This attribute is queryable
    after all processing with is_reversed."""

    __slots__ = tuple()

    def rbound(self):
        """:return: rightmost extend in bytes, absolute"""
        if len(self) == 0:
            return 0
        return self[-1].rbound()

    def lbound(self):
        """:return: leftmost byte at which this chunklist starts"""
        if len(self) == 0:
            return 0
        return self[0].to

    def size(self):
        """:return: size of bytes as measured by our delta chunks"""
        return self.rbound() - self.lbound()

    def apply(self, bbuf, write):
        """Apply the chain's changes and write the result using ``write``.
        Only used by public clients, internally we only use the global routines
        for performance"""
        return delta_list_apply(self, bbuf, write)

    def compress(self):
        """Alter the list to reduce the amount of nodes. Currently we concatenate
        add-chunks
        :return: self"""
        slen = len(self)
        if slen < 2:
            return self
        i = 0

        # index of the first chunk in the current run of consecutive add-chunks
        first_data_index = None
        while i < slen:
            dc = self[i]
            i += 1
            if dc.data is None:
                # a copy-chunk terminates a run of add-chunks - merge the run
                # into a single chunk if it contains more than one member
                if first_data_index is not None and i - 2 - first_data_index > 1:
                    # BUGFIX: the previous implementation accumulated the chunk
                    # data in an io.StringIO, which raises TypeError for the
                    # bytes chunks used in python 3 - join the bytes directly.
                    parts = []
                    so = self[first_data_index].to
                    for x in range(first_data_index, i - 1):
                        xdc = self[x]
                        parts.append(xdc.data[:xdc.ts])
                    # END for each chunk to merge

                    del(self[first_data_index:i - 1])
                    buf = b''.join(parts)
                    self.insert(first_data_index, DeltaChunk(so, len(buf), 0, buf))

                    # the list shrank - fix up our iteration state
                    slen = len(self)
                    i = first_data_index + 1
                # END concatenate run
                first_data_index = None
                continue
            # END handle copy chunks

            if first_data_index is None:
                first_data_index = i - 1
            # END remember start of add-chunk run
        # END for each chunk

        return self

    def check_integrity(self, target_size=-1):
        """Verify the list has non-overlapping chunks only, and the total size matches
        target_size
        :param target_size: if not -1, the total size of the chain must be target_size
        :raise AssertionError: if the size doesn't match"""
        if target_size > -1:
            assert self[-1].rbound() == target_size
            assert reduce(lambda x, y: x + y, (d.ts for d in self), 0) == target_size
        # END target size verification

        if len(self) < 2:
            return

        # check the data of each chunk on its own
        for dc in self:
            assert dc.ts > 0
            if dc.has_data():
                assert len(dc.data) >= dc.ts
        # END for each dc

        left = islice(self, 0, len(self) - 1)
        right = iter(self)
        # BUGFIX: ``right.next()`` is python 2 only syntax and raises
        # AttributeError in python 3 - use the builtin next() instead.
        next(right)
        # this is very pythonic - we might have just used index based access here,
        # but this could actually be faster
        for lft, rgt in zip(left, right):
            assert lft.rbound() == rgt.to
            assert lft.to + lft.ts == rgt.to
        # END for each neighbouring pair
| |
|
|
|
|
class TopdownDeltaChunkList(DeltaChunkList):

    """Represents a list which is generated by feeding its ancestor streams one by
    one"""
    __slots__ = tuple()

    def connect_with_next_base(self, bdcl):
        """Connect this chain with the next level of our base delta chunklist.
        The goal in this game is to mark as many of our chunks rigid, hence they
        cannot be changed by any of the upcoming bases anymore. Once all our
        chunks are marked like that, we can stop all processing
        :param bdcl: data chunk list being one of our bases. They must be fed in
            consecutively and in order, towards the earliest ancestor delta
        :return: True if processing was done. Use it to abort processing of
            remaining streams if False is returned"""
        nfc = 0           # number of frozen chunks (add-chunks, which carry data)
        dci = 0           # index of the chunk currently being processed
        slen = len(self)  # cached length, updated whenever we splice chunks in
        ccl = list()      # reusable scratch list receiving slices of the base chain
        while dci < slen:
            dc = self[dci]
            dci += 1

            # add-chunks carry their own data - no base can change them anymore
            if dc.data is not None:
                nfc += 1
                continue
            # END handle frozen chunks

            # copy-chunk: resolve it by slicing the region it copies out of the
            # base chunk list (ccl is cleared and refilled in place)
            del(ccl[:])
            delta_list_slice(bdcl, dc.so, dc.ts, ccl)

            # rebase the sliced chunks from base offsets onto our target offsets
            ofs = dc.to - dc.so
            for cdc in ccl:
                cdc.to += ofs
            # END for each sliced chunk

            if len(ccl) == 1:
                self[dci - 1] = ccl[0]
            else:
                # splice the multi-chunk expansion in place of the single
                # copy-chunk it replaces, keeping the tail intact
                post_dci = self[dci:]
                del(self[dci - 1:])
                self.extend(ccl)
                self.extend(post_dci)

                # account for the changed length and skip the spliced-in chunks
                slen = len(self)
                dci += len(ccl) - 1
            # END handle chunk replacement
        # END for each chunk

        # once every chunk is frozen, no earlier base can change the result
        if nfc == slen:
            return False
        # END handle completeness
        return True
|
|
|
|
| |
|
|
| |
|
|
def is_loose_object(m):
    """
    :return: True if the data contained in memory map m appears to be a loose object.
        Only the first two bytes are needed
    :param m: bytes-like random access data, e.g. a memory map"""
    # BUGFIX: iterating a bytes slice yields ints in python 3, so the previous
    # map(ord, m[:2]) raised TypeError - unpack the two int values directly.
    b0, b1 = m[:2]
    word = (b0 << 8) + b1
    # 0x78 is the zlib CMF byte used by git; a valid zlib header is a multiple of 31
    return b0 == 0x78 and (word % 31) == 0
|
|
|
|
def loose_object_header_info(m):
    """
    :return: tuple(type_string, uncompressed_size_in_bytes) the type string of the
        object as well as its uncompressed size in bytes.
    :param m: memory map from which to read the compressed object data"""
    # 8kb is plenty to hold the decompressed header of any loose object
    max_header_bytes = 8192
    hdr = decompressobj().decompress(m, max_header_bytes)
    # header layout: b'<type> <size>\0<content...>'
    type_name, size = hdr[:hdr.find(NULL_BYTE)].split(BYTE_SPACE)
    return type_name, int(size)
|
|
|
|
def pack_object_header_info(data):
    """
    :return: tuple(type_id, uncompressed_size_in_bytes, byte_offset)
        The type_id should be interpreted according to the ``type_id_to_type_map`` map
        The byte-offset specifies the start of the actual zlib compressed datastream
    :param data: random-access memory, like a string or memory map"""
    # first byte: 1 continuation bit, 3 type bits, 4 low size bits
    c = byte_ord(data[0])
    type_id = (c >> 4) & 7
    size = c & 15
    i = 1
    shift = 4
    # remaining bytes each contribute 7 size bits while the msb is set
    while c & 0x80:
        c = byte_ord(data[i])
        i += 1
        size += (c & 0x7f) << shift
        shift += 7
    # END character loop
    return (type_id, size, i)
|
|
|
|
def create_pack_object_header(obj_type, obj_size):
    """
    :return: bytearray defining the pack header comprised of the object type
        and its uncompressed size in bytes

    :param obj_type: pack type_id of the object
    :param obj_size: uncompressed size in bytes of the following object stream"""
    header = bytearray()

    # first byte holds 3 type bits and the 4 low size bits
    byte = (obj_type << 4) | (obj_size & 0xf)
    remaining = obj_size >> 4
    # each following byte carries 7 more size bits; msb marks continuation
    while remaining:
        header.append(byte | 0x80)
        byte = remaining & 0x7f
        remaining >>= 7
    # END until remaining size is consumed
    header.append(byte)

    return header
|
|
|
|
def msb_size(data, offset=0):
    """
    :return: tuple(read_bytes, size) read the msb size from the given random
        access data starting at the given byte offset"""
    size = 0
    i = 0
    l = len(data)
    while i < l:
        c = data[i + offset]
        # each byte contributes its low 7 bits, least-significant group first
        size |= (c & 0x7f) << (i * 7)
        i += 1
        # a cleared msb marks the final byte of the size
        if not (c & 0x80):
            return i + offset, size
    # END while in range
    raise AssertionError("Could not find terminating MSB byte in data stream")
|
|
|
|
def loose_object_header(type, size):
    """
    :return: bytes representing the loose object header, which is immediately
        followed by the content stream of size 'size'
    :param type: type name of the object, text or bytes
    :param size: uncompressed content size in bytes"""
    header_text = '%s %i\0' % (force_text(type), size)
    return header_text.encode('ascii')
|
|
|
|
def write_object(type, size, read, write, chunk_size=chunk_size):
    """
    Write the object as identified by type, size and source_stream into the
    target_stream

    :param type: type string of the object
    :param size: amount of bytes to write from source_stream
    :param read: read method of a stream providing the content data
    :param write: write method of the output stream
    :param chunk_size: amount of bytes to copy per iteration
    :return: The actual amount of bytes written to stream, including the header"""
    # header first, then stream the content behind it
    bytes_written = write(loose_object_header(type, size))
    bytes_written += stream_copy(read, write, size, chunk_size)
    return bytes_written
|
|
|
|
def stream_copy(read, write, size, chunk_size):
    """
    Copy a stream up to size bytes using the provided read and write methods,
    in chunks of chunk_size

    **Note:** its much like stream_copy utility, but operates just using methods"""
    copied = 0

    while True:
        # never request more than the caller asked for in total
        to_read = min(chunk_size, size - copied)
        data = read(to_read)
        copied += len(data)
        write(data)
        # a short read means the source is exhausted; otherwise stop at size
        if len(data) < to_read or copied == size:
            break
    # END copy loop

    return copied
|
|
|
|
def connect_deltas(dstreams):
    """
    Read the condensed delta chunk information from dstream and merge its information
    into a list of existing delta chunks

    :param dstreams: iterable of delta stream objects, the delta to be applied last
        comes first, then all its ancestors in order
    :return: DeltaChunkList, containing all operations to apply"""
    tdcl = None  # topmost delta chunk list - the final result

    dcl = tdcl = TopdownDeltaChunkList()
    for dsi, ds in enumerate(dstreams):
        db = ds.read()
        delta_buf_size = ds.size

        # read the base and target buffer sizes from the delta header
        i, base_size = msb_size(db)
        i, target_size = msb_size(db, i)

        # interpret the opcode stream
        tbw = 0  # amount of target bytes written
        while i < delta_buf_size:
            # BUGFIX: indexing bytes yields ints in python 3, so the previous
            # ord(db[i]) calls raised TypeError. Index directly, which also
            # matches the sibling apply_delta_data implementation.
            c = db[i]
            i += 1
            if c & 0x80:
                # copy operation - low 7 flag bits select which offset/size
                # bytes follow
                cp_off, cp_size = 0, 0
                if (c & 0x01):
                    cp_off = db[i]
                    i += 1
                if (c & 0x02):
                    cp_off |= (db[i] << 8)
                    i += 1
                if (c & 0x04):
                    cp_off |= (db[i] << 16)
                    i += 1
                if (c & 0x08):
                    cp_off |= (db[i] << 24)
                    i += 1
                if (c & 0x10):
                    cp_size = db[i]
                    i += 1
                if (c & 0x20):
                    cp_size |= (db[i] << 8)
                    i += 1
                if (c & 0x40):
                    cp_size |= (db[i] << 16)
                    i += 1

                # a zero size encodes 64kb
                if not cp_size:
                    cp_size = 0x10000

                # abort on copies reaching outside the base buffer
                rbound = cp_off + cp_size
                if (rbound < cp_size or
                        rbound > base_size):
                    break

                dcl.append(DeltaChunk(tbw, cp_size, cp_off, None))
                tbw += cp_size
            elif c:
                # add operation - the next c bytes are literal data
                dcl.append(DeltaChunk(tbw, c, 0, db[i:i + c]))
                i += c
                tbw += c
            else:
                raise ValueError("unexpected delta opcode 0")
        # END handle command byte

        # merge consecutive add-chunks to reduce node count
        dcl.compress()

        # merge the lower level's chunks into the topmost list
        if dsi > 0:
            if not tdcl.connect_with_next_base(dcl):
                break
        # END handle merge

        # prepare a fresh list for the next ancestor stream
        dcl = DeltaChunkList()
    # END for each delta stream

    return tdcl
|
|
|
|
def apply_delta_data(src_buf, src_buf_size, delta_buf, delta_buf_size, write):
    """
    Apply data from a delta buffer using a source buffer to the target file

    :param src_buf: random access data from which the delta was created
    :param src_buf_size: size of the source buffer in bytes
    :param delta_buf_size: size for the delta buffer in bytes
    :param delta_buf: random access delta data
    :param write: write method taking a chunk of bytes

    **Note:** transcribed to python from the similar routine in patch-delta.c"""
    db = delta_buf
    i = 0
    while i < delta_buf_size:
        opcode = db[i]
        i += 1
        if opcode & 0x80:
            # copy operation - the low 7 flag bits select which offset and
            # size bytes follow the opcode
            cp_off = 0
            cp_size = 0
            for flag, shift in ((0x01, 0), (0x02, 8), (0x04, 16), (0x08, 24)):
                if opcode & flag:
                    cp_off |= db[i] << shift
                    i += 1
            # END gather offset bytes
            for flag, shift in ((0x10, 0), (0x20, 8), (0x40, 16)):
                if opcode & flag:
                    cp_size |= db[i] << shift
                    i += 1
            # END gather size bytes

            # a zero size encodes 64kb
            if not cp_size:
                cp_size = 0x10000

            # abort on copies reaching outside the source buffer
            rbound = cp_off + cp_size
            if rbound < cp_size or rbound > src_buf_size:
                break
            write(src_buf[cp_off:cp_off + cp_size])
        elif opcode:
            # add operation - the next ``opcode`` bytes are literal data
            write(db[i:i + opcode])
            i += opcode
        else:
            raise ValueError("unexpected delta opcode 0")
    # END for each opcode

    # enforce full consumption of the delta stream
    assert i == delta_buf_size, "delta replay has gone wild"
|
|
|
|
def is_equal_canonical_sha(canonical_length, match, sha1):
    """
    :return: True if the given partial binary sha matches the full 20 byte sha.
        The comparison will take the canonical_length of the match sha into account,
        hence the comparison will only use the last 4 bits for uneven canonical representations
    :param canonical_length: length of the match in hexadecimal characters
    :param match: less than 20 byte sha
    :param sha1: 20 byte sha"""
    binary_length = canonical_length // 2
    if match[:binary_length] != sha1[:binary_length]:
        return False

    # an odd hex length means only the high nibble of the last byte counts
    if canonical_length - binary_length:
        if (byte_ord(match[-1]) ^ byte_ord(sha1[len(match) - 1])) & 0xf0:
            return False
    # END handle uneven canonical length

    return True
|
|
| |
|
|
|
|
# If the compiled gitdb_speedups extension is installed, its C implementation
# replaces the pure-python connect_deltas defined above; otherwise keep ours.
try:
    from gitdb_speedups._perf import connect_deltas
except ImportError:
    pass
|
|