| |
| |
| |
| |
| """Contains PackIndexFile and PackFile implementations""" |
| import zlib |
|
|
| from gitdb.exc import ( |
| BadObject, |
| AmbiguousObjectName, |
| UnsupportedOperation, |
| ParseError |
| ) |
|
|
| from gitdb.util import ( |
| mman, |
| LazyMixin, |
| unpack_from, |
| bin_to_hex, |
| byte_ord, |
| ) |
|
|
| from gitdb.fun import ( |
| create_pack_object_header, |
| pack_object_header_info, |
| is_equal_canonical_sha, |
| type_id_to_type_map, |
| write_object, |
| stream_copy, |
| chunk_size, |
| delta_types, |
| OFS_DELTA, |
| REF_DELTA, |
| msb_size |
| ) |
|
|
| try: |
| from gitdb_speedups._perf import PackIndexFile_sha_to_index |
| except ImportError: |
| pass |
| |
|
|
| from gitdb.base import ( |
| OInfo, |
| OStream, |
| OPackInfo, |
| OPackStream, |
| ODeltaStream, |
| ODeltaPackInfo, |
| ODeltaPackStream, |
| ) |
|
|
| from gitdb.stream import ( |
| DecompressMemMapReader, |
| DeltaApplyReader, |
| Sha1Writer, |
| NullStream, |
| FlexibleSha1Writer |
| ) |
|
|
| from struct import pack |
| from binascii import crc32 |
|
|
| from gitdb.const import NULL_BYTE |
|
|
| import tempfile |
| import array |
| import os |
| import sys |
|
|
| __all__ = ('PackIndexFile', 'PackFile', 'PackEntity') |
|
|
|
|
| |
|
|
def pack_object_at(cursor, offset, as_stream):
    """Parse the pack entry whose header starts at the given offset.

    :param cursor: memory cursor providing random access to the pack's data
    :param offset: offset into the pack data at which the object's header begins
    :param as_stream: if True, the returned object carries a stream from which the
        data can be read in decompressed form, otherwise an info-only object is returned
    :return: Tuple(abs_data_offset, OPackInfo|OPackStream|ODeltaPackInfo|ODeltaPackStream)
        an instance of the type matching the entry's type_id; abs_data_offset is the
        absolute offset at which the object's compressed data begins"""
    data = cursor.use_region(offset).buffer()
    type_id, uncomp_size, data_rela_offset = pack_object_header_info(data)
    total_rela_offset = None
    delta_info = None

    if type_id == OFS_DELTA:
        # decode the variable-length, big-endian backwards-offset to the base object
        pos = data_rela_offset
        byte = byte_ord(data[pos])
        pos += 1
        base_offset = byte & 0x7f
        while byte & 0x80:
            byte = byte_ord(data[pos])
            pos += 1
            base_offset = ((base_offset + 1) << 7) + (byte & 0x7f)
        # END decode msb-encoded offset
        delta_info = base_offset
        total_rela_offset = pos
    elif type_id == REF_DELTA:
        # the 20 byte binary sha of the base object follows the header directly
        total_rela_offset = data_rela_offset + 20
        delta_info = data[data_rela_offset:total_rela_offset]
    else:
        # non-delta object - compressed data starts right after the header
        total_rela_offset = data_rela_offset
    # END handle type id

    abs_data_offset = offset + total_rela_offset
    if as_stream:
        stream = DecompressMemMapReader(data[total_rela_offset:], False, uncomp_size)
        if delta_info is None:
            return abs_data_offset, OPackStream(offset, type_id, uncomp_size, stream)
        return abs_data_offset, ODeltaPackStream(offset, type_id, uncomp_size, delta_info, stream)
    # END handle streams
    if delta_info is None:
        return abs_data_offset, OPackInfo(offset, type_id, uncomp_size)
    return abs_data_offset, ODeltaPackInfo(offset, type_id, uncomp_size, delta_info)
| |
| |
|
|
|
|
def write_stream_to_pack(read, write, zstream, base_crc=None):
    """Copy a stream as read from the read function, deflate it, and write the result.
    Count the number of read and written bytes and return them.

    :param read: callable taking a byte count, returning the next chunk of raw data
    :param write: callable receiving each compressed chunk
    :param zstream: zlib compressor object used to deflate the data
    :param base_crc: if not None, the crc will be the base for all compressed data
        we consecutively write and generate a crc32 from. If None, no crc will be generated
    :return: tuple(no bytes read, no bytes written, crc32) crc might be 0 if base_crc
        was false"""
    bytes_read = 0
    bytes_written = 0
    track_crc = base_crc is not None
    crc = base_crc if track_crc else 0

    # a chunk shorter than chunk_size signals the end of the input stream
    chunk = read(chunk_size)
    while True:
        bytes_read += len(chunk)
        deflated = zstream.compress(chunk)
        bytes_written += len(deflated)
        write(deflated)
        if track_crc:
            crc = crc32(deflated, crc)
        # END crc tracking
        if len(chunk) != chunk_size:
            break
        chunk = read(chunk_size)
    # END copy loop

    # flush the compressor's internal buffers and account for the tail as well
    tail = zstream.flush()
    bytes_written += len(tail)
    write(tail)
    if track_crc:
        crc = crc32(tail, crc)
    # END crc tracking

    return (bytes_read, bytes_written, crc)
|
|
|
|
| |
|
|
|
|
class IndexWriter:

    """Utility caching index entries, allowing to serialize them all at once
    into a complete index file via :meth:`write`
    **Note:** currently only writes v2 indices"""
    __slots__ = '_objs'

    def __init__(self):
        self._objs = []

    def append(self, binsha, crc, offset):
        """Cache one entry: the object's binary sha, its crc and its pack offset"""
        self._objs.append((binsha, crc, offset))

    def write(self, pack_sha, write):
        """Write the index file using the given write method
        :param pack_sha: binary sha over the whole pack that we index
        :return: sha1 binary sha over all index file contents"""
        # the sha listing must be sorted for the binary search to work
        self._objs.sort(key=lambda entry: entry[0])

        hashed_writer = FlexibleSha1Writer(write)
        emit = hashed_writer.write
        emit(PackIndexFile.index_v2_signature)
        emit(pack(">L", PackIndexFile.index_version_default))

        # FANOUT: entry i holds the cumulative count of shas whose first byte is <= i
        counts = [0] * 256
        for entry in self._objs:
            counts[byte_ord(entry[0][0])] += 1
        # END count first bytes
        cumulative = 0
        for count in counts:
            cumulative += count
            emit(pack('>L', cumulative))
        # END write fanout

        # SHA LISTING: all binary shas, densely packed in sorted order
        emit(b''.join(entry[0] for entry in self._objs))

        # CRC LISTING: one crc32 per entry, in the same order
        for entry in self._objs:
            emit(pack('>L', entry[1] & 0xffffffff))
        # END write crcs

        # OFFSET LISTING: 32 bit offsets; anything beyond 2**31-1 is redirected
        # into the trailing 64 bit table, with the msb flagging its table index
        large_offsets = []
        for entry in self._objs:
            offset = entry[2]
            if offset > 0x7fffffff:
                large_offsets.append(offset)
                offset = 0x80000000 + len(large_offsets) - 1
            # END handle 64 bit offsets
            emit(pack('>L', offset & 0xffffffff))
        # END write offsets

        # 64 BIT OFFSET TABLE for the redirected entries, in redirection order
        for offset in large_offsets:
            emit(pack(">Q", offset))
        # END write large offsets

        # TRAILER: the pack's checksum, then the checksum over this index file
        assert(len(pack_sha) == 20)
        emit(pack_sha)
        sha = hashed_writer.sha(as_hex=False)
        write(sha)
        return sha
|
|
|
|
class PackIndexFile(LazyMixin):

    """A pack index provides offsets into the corresponding pack, allowing to find
    locations for offsets faster."""

    # Don't use slots: version-specific accessors (entry, offset, sha, crc) are
    # bound onto the instance dynamically in _set_cache_.

    # v2 layout: 8 byte header (signature + version) followed by the
    # 256 entry * 4 byte fanout table, then the sorted sha listing
    _sha_list_offset = 8 + 1024
    index_v2_signature = b'\xfftOc'
    index_version_default = 2

    def __init__(self, indexpath):
        """:param indexpath: path to the index file to map"""
        super().__init__()
        self._indexpath = indexpath

    def close(self):
        """Drop our memory map so the underlying file handle can be released,
        which matters especially on windows"""
        mman.force_map_handle_removal_win(self._indexpath)
        self._cursor = None

    def _set_cache_(self, attr):
        """Lazily fill in the attribute requested through LazyMixin"""
        if attr == "_packfile_checksum":
            # second to last 20 bytes: sha over the indexed pack
            self._packfile_checksum = self._cursor.map()[-40:-20]
        elif attr == "_indexfile_checksum":
            # FIX: this branch previously repeated the "_packfile_checksum" test
            # and was unreachable; it caches the trailing 20 bytes, the sha over
            # the index file itself
            self._indexfile_checksum = self._cursor.map()[-20:]
        elif attr == "_cursor":
            # map the whole index file - we assume it always fits into a
            # single mapped window
            self._cursor = mman.make_cursor(self._indexpath).use_region()
            if mman.window_size() > 0 and self._cursor.file_size() > mman.window_size():
                raise AssertionError("The index file at %s is too large to fit into a mapped window (%i > %i). This is a limitation of the implementation" % (
                    self._indexpath, self._cursor.file_size(), mman.window_size()))
            # END assert window size
        else:
            # anything else requires full initialization: detect the version,
            # bind the version specific accessors, then read the fanout table

            # CHECK VERSION - v2 files start with the signature, v1 files start
            # directly with the fanout table
            mmap = self._cursor.map()
            self._version = (mmap[:4] == self.index_v2_signature and 2) or 1
            if self._version == 2:
                version_id = unpack_from(">L", mmap, 4)[0]
                assert version_id == self._version, "Unsupported index version: %i" % version_id
            # END assert version

            # bind the public accessors to the implementation matching our version
            for fname in ('entry', 'offset', 'sha', 'crc'):
                setattr(self, fname, getattr(self, "_%s_v%i" % (fname, self._version)))
            # END for each accessor to relink

            self._initialize()
        # END handle attributes

    #{ Access V1

    def _entry_v1(self, i):
        """:return: tuple(offset, binsha, 0)"""
        return unpack_from(">L20s", self._cursor.map(), 1024 + i * 24) + (0, )

    def _offset_v1(self, i):
        """see ``_offset_v2``"""
        return unpack_from(">L", self._cursor.map(), 1024 + i * 24)[0]

    def _sha_v1(self, i):
        """see ``_sha_v2``"""
        # v1 entries are 24 bytes: 4 byte offset followed by the 20 byte sha
        base = 1024 + (i * 24) + 4
        return self._cursor.map()[base:base + 20]

    def _crc_v1(self, i):
        """unsupported - v1 indices carry no crc information, hence always 0"""
        return 0

    #} END access V1

    #{ Access V2

    def _entry_v2(self, i):
        """:return: tuple(offset, binsha, crc)"""
        return (self._offset_v2(i), self._sha_v2(i), self._crc_v2(i))

    def _offset_v2(self, i):
        """:return: 32 or 64 byte offset into pack files. 64 byte offsets will only
        be returned if the pack is larger than 4 GiB, or 2^32"""
        offset = unpack_from(">L", self._cursor.map(), self._pack_offset + i * 4)[0]

        # if the msb is set, the lower 31 bits index into the trailing
        # 64 bit offset table
        if offset & 0x80000000:
            offset = unpack_from(">Q", self._cursor.map(), self._pack_64_offset + (offset & ~0x80000000) * 8)[0]
        # END handle 64 bit offset

        return offset

    def _sha_v2(self, i):
        """:return: sha at the given index of this file index instance"""
        base = self._sha_list_offset + i * 20
        return self._cursor.map()[base:base + 20]

    def _crc_v2(self, i):
        """:return: 4 bytes crc for the object at index i"""
        return unpack_from(">L", self._cursor.map(), self._crc_list_offset + i * 4)[0]

    #} END access V2

    #{ Initialization

    def _initialize(self):
        """initialize base data: read the fanout table and, for v2 files,
        compute the offsets of the crc, 32 bit offset and 64 bit offset tables"""
        self._fanout_table = self._read_fanout((self._version == 2) * 8)

        if self._version == 2:
            self._crc_list_offset = self._sha_list_offset + self.size() * 20
            self._pack_offset = self._crc_list_offset + self.size() * 4
            self._pack_64_offset = self._pack_offset + self.size() * 4
        # END setup v2 tables

    def _read_fanout(self, byte_offset):
        """Generate a fanout table from our data
        :param byte_offset: offset at which the 256 big-endian 32 bit counters start"""
        d = self._cursor.map()
        out = list()
        append = out.append
        for i in range(256):
            append(unpack_from('>L', d, byte_offset + i * 4)[0])
        # END for each fanout entry

        return out

    #} END initialization

    #{ Properties

    def version(self):
        """:return: version of this index file, 1 or 2"""
        return self._version

    def size(self):
        """:return: amount of objects referred to by this index"""
        return self._fanout_table[255]

    def path(self):
        """:return: path to the packindexfile"""
        return self._indexpath

    def packfile_checksum(self):
        """:return: 20 byte sha representing the sha1 hash of the pack file"""
        return self._cursor.map()[-40:-20]

    def indexfile_checksum(self):
        """:return: 20 byte sha representing the sha1 hash of this index file"""
        return self._cursor.map()[-20:]

    def offsets(self):
        """:return: sequence of all offsets in the order in which they were written

        **Note:** return value can be random accessed, but may be immmutable"""
        if self._version == 2:
            # read the whole 32 bit offset table in one go
            a = array.array('I')
            a.frombytes(self._cursor.map()[self._pack_offset:self._pack_64_offset])

            # the file stores big-endian values, swap on little-endian hosts
            if sys.byteorder == 'little':
                a.byteswap()
            return a
        else:
            return tuple(self.offset(index) for index in range(self.size()))
        # END handle version

    def sha_to_index(self, sha):
        """
        :return: index usable with the ``offset`` or ``entry`` method, or None
            if the sha was not found in this pack index
        :param sha: 20 byte sha to lookup"""
        first_byte = byte_ord(sha[0])
        get_sha = self.sha
        lo = 0
        if first_byte != 0:
            lo = self._fanout_table[first_byte - 1]
        hi = self._fanout_table[first_byte]

        # binary search within the fanout-bounded slice of the sorted sha list
        while lo < hi:
            mid = (lo + hi) // 2
            mid_sha = get_sha(mid)
            if sha < mid_sha:
                hi = mid
            elif sha == mid_sha:
                return mid
            else:
                lo = mid + 1
            # END handle midpoint
        # END bisect

        return None

    def partial_sha_to_index(self, partial_bin_sha, canonical_length):
        """
        :return: index as in `sha_to_index` or None if the sha was not found in this
            index file
        :param partial_bin_sha: an at least two bytes of a partial binary sha as bytes
        :param canonical_length: length of the original hexadecimal representation of the
            given partial binary sha
        :raise AmbiguousObjectName: if more than one object matches the partial sha"""
        if len(partial_bin_sha) < 2:
            raise ValueError("Require at least 2 bytes of partial sha")

        assert isinstance(partial_bin_sha, bytes), "partial_bin_sha must be bytes"
        first_byte = byte_ord(partial_bin_sha[0])

        get_sha = self.sha
        lo = 0
        if first_byte != 0:
            lo = self._fanout_table[first_byte - 1]
        hi = self._fanout_table[first_byte]

        # pad with NULLs so it compares as the smallest sha with this prefix
        filled_sha = partial_bin_sha + NULL_BYTE * (20 - len(partial_bin_sha))

        # binary search for the first candidate position
        while lo < hi:
            mid = (lo + hi) // 2
            mid_sha = get_sha(mid)
            if filled_sha < mid_sha:
                hi = mid
            elif filled_sha == mid_sha:
                # the padded sha is an exact entry - it is the single candidate
                lo = mid
                break
            else:
                lo = mid + 1
            # END handle midpoint
        # END bisect

        # lo is now the lowest index whose sha may match the prefix
        if lo < self.size():
            cur_sha = get_sha(lo)
            if is_equal_canonical_sha(canonical_length, partial_bin_sha, cur_sha):
                next_sha = None
                if lo + 1 < self.size():
                    next_sha = get_sha(lo + 1)
                if next_sha and next_sha == cur_sha:
                    raise AmbiguousObjectName(partial_bin_sha)
                return lo
            # END if we have a match
        # END if we have a sha

        return None

    if 'PackIndexFile_sha_to_index' in globals():
        # the optional C extension was imported successfully - its lookup
        # replaces the pure-python sha_to_index above
        def sha_to_index(self, sha):
            return PackIndexFile_sha_to_index(self, sha)
    # END redefine heavy-hitter with c version

    #} END properties
| |
|
|
| |
|
|
|
|
class PackFile(LazyMixin):

    """A pack is a file written according to the Version 2 for git packs

    As we currently use memory maps, it could be assumed that the maximum size of
    packs therefore is 32 bit on 32 bit systems. On 64 bit systems, this should be
    fine though.

    **Note:** at some point, this might be implemented using streams as well, or
    streams are an alternate path in the case memory maps cannot be created
    for some reason - one clearly doesn't want to read 10GB at once in that
    case"""

    __slots__ = ('_packpath', '_cursor', '_size', '_version')
    # 'PACK' in ascii, the first four bytes of every pack file
    pack_signature = 0x5041434b
    pack_version_default = 2

    # header: 4 byte signature, 4 byte version, 4 byte object count
    first_object_offset = 3 * 4
    # the pack ends with a 20 byte sha1 over its contents
    footer_size = 20

    def __init__(self, packpath):
        # path to the .pack file; the memory map is created lazily on first access
        self._packpath = packpath

    def close(self):
        """Close our underlying memory map and release the file handle,
        which matters especially on windows"""
        mman.force_map_handle_removal_win(self._packpath)
        self._cursor = None

    def _set_cache_(self, attr):
        # regardless of which attribute is requested, map the file and
        # parse the header once
        self._cursor = mman.make_cursor(self._packpath).use_region()

        # header: signature, version and object count, all big-endian 32 bit
        type_id, self._version, self._size = unpack_from(">LLL", self._cursor.map(), 0)

        # only verify the signature here; the version is reported via version()
        if type_id != self.pack_signature:
            raise ParseError("Invalid pack signature: %i" % type_id)

    def _iter_objects(self, start_offset, as_stream=True):
        """Handle the actual iteration of objects within this pack

        :param start_offset: absolute offset of the first object to yield, or a
            falsy value to begin at the first object in the pack
        :param as_stream: if True, each yielded stream is rewound to its start"""
        c = self._cursor
        content_size = c.file_size() - self.footer_size
        cur_offset = start_offset or self.first_object_offset

        null = NullStream()
        while cur_offset < content_size:
            data_offset, ostream = pack_object_at(c, cur_offset, True)
            # Decompress the object into a NullStream - this is the only way to
            # learn how many compressed bytes it occupies, which in turn yields
            # the offset of the next object
            stream_copy(ostream.read, null.write, ostream.size, chunk_size)
            assert ostream.stream._br == ostream.size
            # next object = header size (data_offset - pack_offset) + compressed data size
            cur_offset += (data_offset - ostream.pack_offset) + ostream.stream.compressed_bytes_read()

            # rewind the stream we just exhausted before handing it out
            if as_stream:
                ostream.stream.seek(0)
            yield ostream
        # END until we have read everything

    #{ Pack Information

    def size(self):
        """:return: The amount of objects stored in this pack"""
        return self._size

    def version(self):
        """:return: the version of this pack"""
        return self._version

    def data(self):
        """
        :return: read-only data of this pack. It provides random access and usually
            is a memory map.
        :note: This method is unsafe as it returns a window into a file which might be larger than than the actual window size"""
        # can use map as we are starting at offset 0. Otherwise we would have to use buffer()
        return self._cursor.use_region().map()

    def checksum(self):
        """:return: 20 byte sha1 hash on all object sha's contained in this file"""
        # the checksum occupies the last 20 bytes of the pack
        return self._cursor.use_region(self._cursor.file_size() - 20).buffer()[:]

    def path(self):
        """:return: path to the packfile"""
        return self._packpath
    #} END pack information

    #{ Pack Specific

    def collect_streams(self, offset):
        """
        :return: list of pack streams which are required to build the object
            at the given offset. The first entry of the list is the object at offset,
            the last one is either a full object, or a REF_Delta stream. The latter
            type needs its reference object to be locked up in an ODB to form a valid
            delta chain.
            If the object at offset is no delta, the size of the list is 1.
        :param offset: specifies the first byte of the object within this pack"""
        out = list()
        c = self._cursor
        while True:
            ostream = pack_object_at(c, offset, True)[1]
            out.append(ostream)
            if ostream.type_id == OFS_DELTA:
                # delta_info is the backwards distance to the base object
                offset = ostream.pack_offset - ostream.delta_info
            else:
                # the stream is either a full object, or a ref-delta whose base
                # lives outside of this pack - either way we are done
                break
            # END handle type
        # END while chaining streams

        return out

    #} END pack specific

    #{ Read-Database like Interface

    def info(self, offset):
        """Retrieve information about the object at the given file-absolute offset

        :param offset: byte offset
        :return: OPackInfo instance, the actual type differs depending on the type_id attribute"""
        return pack_object_at(self._cursor, offset or self.first_object_offset, False)[1]

    def stream(self, offset):
        """Retrieve an object at the given file-relative offset as stream along with its information

        :param offset: byte offset
        :return: OPackStream instance, the actual type differs depending on the type_id attribute"""
        return pack_object_at(self._cursor, offset or self.first_object_offset, True)[1]

    def stream_iter(self, start_offset=0):
        """
        :return: iterator yielding OPackStream compatible instances, allowing
            to access the data in the pack directly.
        :param start_offset: offset to the first object to iterate. If 0, iteration
            starts at the very first object in the pack.

        **Note:** Iterating a pack directly is costly as the datastream has to be decompressed
        to determine the bounds between the objects"""
        return self._iter_objects(start_offset, as_stream=True)

    #} END Read-Database like Interface
|
|
| |
|
|
|
|
class PackEntity(LazyMixin):

    """Combines the PackIndexFile and the PackFile into one, allowing the
    actual objects to be resolved and iterated"""

    __slots__ = ('_index',            # our index file
                 '_pack',             # our pack file
                 '_offset_map'        # on-demand: maps each object offset to the offset of the next object
                 )

    IndexFileCls = PackIndexFile
    PackFileCls = PackFile

    def __init__(self, pack_or_index_path):
        """Initialize ourselves with the path to the respective pack or index file"""
        basename, ext = os.path.splitext(pack_or_index_path)
        self._index = self.IndexFileCls("%s.idx" % basename)
        self._pack = self.PackFileCls("%s.pack" % basename)

    def close(self):
        """Close both the index and the pack file, releasing their resources"""
        self._index.close()
        self._pack.close()

    def _set_cache_(self, attr):
        # currently this can only be _offset_map:
        # sort all offsets and map each one to the offset of the next object -
        # the difference is the object's total compressed size within the pack
        offsets_sorted = sorted(self._index.offsets())
        last_offset = len(self._pack.data()) - self._pack.footer_size
        assert offsets_sorted, "Cannot handle empty indices"

        offset_map = None
        if len(offsets_sorted) == 1:
            offset_map = {offsets_sorted[0]: last_offset}
        else:
            iter_offsets = iter(offsets_sorted)
            iter_offsets_plus_one = iter(offsets_sorted)
            next(iter_offsets_plus_one)
            consecutive = zip(iter_offsets, iter_offsets_plus_one)

            offset_map = dict(consecutive)

            # the last object is bounded by the start of the pack's trailing checksum
            offset_map[offsets_sorted[-1]] = last_offset
        # END handle offset amount
        self._offset_map = offset_map

    def _sha_to_index(self, sha):
        """:return: index for the given sha, or raise
        :raise BadObject: if the sha is not contained in our index"""
        index = self._index.sha_to_index(sha)
        if index is None:
            raise BadObject(sha)
        return index

    def _iter_objects(self, as_stream):
        """Iterate over all objects in our index and yield their OInfo or OStream instances"""
        _sha = self._index.sha
        _object = self._object
        for index in range(self._index.size()):
            yield _object(_sha(index), as_stream, index)
        # END for each index

    def _object(self, sha, as_stream, index=-1):
        """:return: OInfo or OStream object providing information about the given sha
        :param index: if not -1, its assumed to be the sha's index in the IndexFile"""
        # either sha or index may be given - resolve whichever is missing
        if index < 0:
            index = self._sha_to_index(sha)
        if sha is None:
            sha = self._index.sha(index)
        # END assure sha is present (in output)

        offset = self._index.offset(index)
        type_id, uncomp_size, data_rela_offset = pack_object_header_info(self._pack._cursor.use_region(offset).buffer())
        if as_stream:
            if type_id not in delta_types:
                packstream = self._pack.stream(offset)
                return OStream(sha, packstream.type, packstream.size, packstream.stream)
            # END handle non-deltas

            # produce a delta stream containing all streams required to
            # reconstruct the object; the reader resolves them lazily
            streams = self.collect_streams_at_offset(offset)
            dstream = DeltaApplyReader.new(streams)

            # the final size is only known once the delta is applied, hence None
            return ODeltaStream(sha, dstream.type, None, dstream)
        else:
            if type_id not in delta_types:
                return OInfo(sha, type_id_to_type_map[type_id], uncomp_size)
            # END handle non-deltas

            # deltas are a little tougher - unpack the first bytes to obtain
            # the actual target size, as opposed to the size of the delta data
            streams = self.collect_streams_at_offset(offset)
            buf = streams[0].read(512)
            offset, src_size = msb_size(buf)
            offset, target_size = msb_size(buf, offset)

            # the bottom stream of the chain carries the actual object type;
            # if it is itself a delta, its base could not be resolved in this pack
            if streams[-1].type_id in delta_types:
                raise BadObject(sha, "Could not resolve delta object")
            return OInfo(sha, streams[-1].type, target_size)
        # END handle stream

    #{ Read-Database like Interface

    def info(self, sha):
        """Retrieve information about the object identified by the given sha

        :param sha: 20 byte sha1
        :raise BadObject:
        :return: OInfo instance, with 20 byte sha"""
        return self._object(sha, False)

    def stream(self, sha):
        """Retrieve an object stream along with its information as identified by the given sha

        :param sha: 20 byte sha1
        :raise BadObject:
        :return: OStream instance, with 20 byte sha"""
        return self._object(sha, True)

    def info_at_index(self, index):
        """As ``info``, but uses a PackIndexFile compatible index to refer to the object"""
        return self._object(None, False, index)

    def stream_at_index(self, index):
        """As ``stream``, but uses a PackIndexFile compatible index to refer to the
        object"""
        return self._object(None, True, index)

    #} END Read-Database like Interface

    #{ Interface

    def pack(self):
        """:return: the underlying pack file instance"""
        return self._pack

    def index(self):
        """:return: the underlying pack index file instance"""
        return self._index

    def is_valid_stream(self, sha, use_crc=False):
        """
        Verify that the stream at the given sha is valid.

        :param use_crc: if True, the index' crc is run over the compressed stream of
            the object, which is much faster than checking the sha1. It is also
            more prone to unnoticed corruption or manipulation.
        :param sha: 20 byte sha1 of the object whose stream to verify
            whether the compressed stream of the object is valid. If it is
            a delta, this only verifies that the delta's data is valid, not the
            data of the actual undeltified object, as it depends on more than
            just this stream.
            If False, the object will be decompressed and the sha generated. It must
            match the given sha

        :return: True if the stream is valid, False otherwise
        :raise UnsupportedOperation: If the index is version 1 only
        :raise BadObject: sha was not found"""
        if use_crc:
            if self._index.version() < 2:
                raise UnsupportedOperation("Version 1 indices do not contain crc's, verify by sha instead")
            # END handle index version

            index = self._sha_to_index(sha)
            offset = self._index.offset(index)
            next_offset = self._offset_map[offset]
            crc_value = self._index.crc(index)

            # create the crc over the raw (compressed) object data, window by
            # window, bounded by the next object's offset
            crc_update = zlib.crc32
            pack_data = self._pack.data()
            cur_pos = offset
            this_crc_value = 0
            while cur_pos < next_offset:
                rbound = min(cur_pos + chunk_size, next_offset)
                size = rbound - cur_pos
                this_crc_value = crc_update(pack_data[cur_pos:cur_pos + size], this_crc_value)
                cur_pos += size
            # END window size loop

            # mask to unsigned 32 bit, matching the value stored in the index
            return (this_crc_value & 0xffffffff) == crc_value
        else:
            shawriter = Sha1Writer()
            stream = self._object(sha, as_stream=True)
            # write a loose object, which is the basis for the sha
            write_object(stream.type, stream.size, stream.read, shawriter.write)

            # FIX: a redundant `assert` previously preceded this comparison,
            # raising AssertionError on a mismatch (and skipping the check under
            # `python -O`) instead of returning False as documented
            return shawriter.sha(as_hex=False) == sha
        # END handle crc/sha verification

    def info_iter(self):
        """
        :return: Iterator over all objects in this pack. The iterator yields
            OInfo instances"""
        return self._iter_objects(as_stream=False)

    def stream_iter(self):
        """
        :return: iterator over all objects in this pack. The iterator yields
            OStream instances"""
        return self._iter_objects(as_stream=True)

    def collect_streams_at_offset(self, offset):
        """
        As the version in the PackFile, but can resolve REF deltas within this pack
        For more info, see ``collect_streams``

        :param offset: offset into the pack file at which the object can be found"""
        streams = self._pack.collect_streams(offset)

        # try to resolve the last one if needed - it is assumed to be either
        # a full object or a ref-delta at this point
        if streams[-1].type_id == REF_DELTA:
            stream = streams[-1]
            while stream.type_id in delta_types:
                if stream.type_id == REF_DELTA:
                    # smmap can return memory view objects, which can't be
                    # compared as buffers/bytes can ...
                    if isinstance(stream.delta_info, memoryview):
                        sindex = self._index.sha_to_index(stream.delta_info.tobytes())
                    else:
                        sindex = self._index.sha_to_index(stream.delta_info)
                    # END handle memory view
                    if sindex is None:
                        # the base is not in this pack - leave the chain unresolved
                        break
                    stream = self._pack.stream(self._index.offset(sindex))
                    streams.append(stream)
                else:
                    # ofs-deltas of the newly found base are collected as well
                    stream = self._pack.stream(stream.delta_info)
                    streams.append(stream)
                # END handle ref delta
            # END resolve ref streams
        # END resolve ref streams

        return streams

    def collect_streams(self, sha):
        """
        As ``PackFile.collect_streams``, but takes a sha instead of an offset.
        Additionally, ref_delta streams will be resolved within this pack.
        If this is not possible, the stream will be left alone, hence it is advised
        to check for unresolved ref-deltas and resolve them before attempting to
        construct a delta stream.

        :param sha: 20 byte sha1 specifying the object whose related streams you want to collect
        :return: list of streams, first being the actual object delta, the last being
            a possibly unresolved base object.
        :raise BadObject:"""
        return self.collect_streams_at_offset(self._index.offset(self._sha_to_index(sha)))

    @classmethod
    def write_pack(cls, object_iter, pack_write, index_write=None,
                   object_count=None, zlib_compression=zlib.Z_BEST_SPEED):
        """
        Create a new pack by putting all objects obtained by the object_iterator
        into a pack which is written using the pack_write method.
        The respective index is produced as well if index_write is not None.

        :param object_iter: iterator yielding odb output objects
        :param pack_write: function to receive strings to write into the pack stream
        :param index_write: if not None, the function writes the index file corresponding
            to the pack.
        :param object_count: if you can provide the amount of objects in your iteration,
            this would be the place to put it. Otherwise we have to pre-iterate and store
            all items into a list to get the number, which uses more memory than necessary.
        :param zlib_compression: the zlib compression level to use
        :return: tuple(pack_sha, index_binsha) binary sha over all the contents of the pack
            and over all contents of the index. If index_write was None, index_binsha will be None

        **Note:** The destination of the write functions is up to the user. It could
        be a socket, or a file for instance

        **Note:** writes only undeltified objects"""
        objs = object_iter
        if not object_count:
            if not isinstance(object_iter, (tuple, list)):
                objs = list(object_iter)
            # END handle list type
            object_count = len(objs)
        # END handle object

        pack_writer = FlexibleSha1Writer(pack_write)
        pwrite = pack_writer.write
        ofs = 0                             # current offset into the pack file
        index = None
        wants_index = index_write is not None

        # PACK HEADER: signature, version and object count
        pwrite(pack('>LLL', PackFile.pack_signature, PackFile.pack_version_default, object_count))
        ofs += 12

        if wants_index:
            index = IndexWriter()
        # END handle index header

        actual_count = 0
        for obj in objs:
            actual_count += 1
            crc = 0

            # object header
            hdr = create_pack_object_header(obj.type_id, obj.size)
            # FIX: the crc decision previously tested `if index_write:` while
            # appending used `wants_index` - use the same condition for both
            if wants_index:
                crc = crc32(hdr)
            else:
                crc = None
            # END handle crc
            pwrite(hdr)

            # data stream - compressed, with crc tracking if an index is wanted
            zstream = zlib.compressobj(zlib_compression)
            ostream = obj.stream
            br, bw, crc = write_stream_to_pack(ostream.read, pwrite, zstream, base_crc=crc)
            assert(br == obj.size)
            if wants_index:
                index.append(obj.binsha, crc, ofs)
            # END handle index

            ofs += len(hdr) + bw
            if actual_count == object_count:
                break
            # END abort once we are done
        # END for each object

        if actual_count != object_count:
            raise ValueError(
                "Expected to write %i objects into pack, but received only %i from iterators" % (object_count, actual_count))
        # END count assertion

        # PACK FOOTER: sha over the pack's contents, written unhashed
        pack_sha = pack_writer.sha(as_hex=False)
        assert len(pack_sha) == 20
        pack_write(pack_sha)
        ofs += len(pack_sha)                        # just for completeness ;)

        index_sha = None
        if wants_index:
            index_sha = index.write(pack_sha, index_write)
        # END handle index

        return pack_sha, index_sha

    @classmethod
    def create(cls, object_iter, base_dir, object_count=None, zlib_compression=zlib.Z_BEST_SPEED):
        """Create a new on-disk entity comprised of a properly named pack file and a properly named
        and corresponding index file. The pack contains all OStream objects contained in object iter.
        :param base_dir: directory which is to contain the files
        :return: PackEntity instance initialized with the new pack

        **Note:** for more information on the other parameters see the write_pack method"""
        pack_fd, pack_path = tempfile.mkstemp('', 'pack', base_dir)
        index_fd, index_path = tempfile.mkstemp('', 'index', base_dir)
        pack_write = lambda d: os.write(pack_fd, d)
        index_write = lambda d: os.write(index_fd, d)

        pack_binsha, index_binsha = cls.write_pack(object_iter, pack_write, index_write, object_count, zlib_compression)
        os.close(pack_fd)
        os.close(index_fd)

        # rename the temporary files to their canonical, sha-derived names
        fmt = "pack-%s.%s"
        new_pack_path = os.path.join(base_dir, fmt % (bin_to_hex(pack_binsha), 'pack'))
        new_index_path = os.path.join(base_dir, fmt % (bin_to_hex(pack_binsha), 'idx'))
        os.rename(pack_path, new_pack_path)
        os.rename(index_path, new_index_path)

        return cls(new_pack_path)

    #} END interface
|
|
| |
|
|