Source code for detools.create

import io
import time
import logging
import tempfile
import mmap
import lzma
from bz2 import BZ2Compressor
from io import BytesIO
import struct
import bitstruct
from humanfriendly import format_timespan
from humanfriendly import format_size
from .errors import Error
from .compression.crle import CrleCompressor
from .compression.none import NoneCompressor
from .compression.heatshrink import HeatshrinkCompressor
from .compression.zstd import ZstdCompressor
from .compression.lz4 import Lz4Compressor
from .common import PATCH_TYPE_SEQUENTIAL
from .common import PATCH_TYPE_IN_PLACE
from .common import PATCH_TYPE_HDIFFPATCH
from .common import DATA_FORMATS
from .common import PATCH_TYPES
from .common import format_bad_compression_string
from .common import compression_string_to_number
from .common import div_ceil
from .common import file_size
from .common import file_read
from .common import pack_size
from .common import DataSegment
from .common import unpack_size_bytes
from .data_format import encode as data_format_encode
from .suffix_array import sais
from .suffix_array import divsufsort
from . import bsdiff
from . import hdiffpatch


LOGGER = logging.getLogger(__name__)


def pack_header(patch_type, compression):
    return bitstruct.pack('p1u3u4', patch_type, compression)
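
# The patch header is a single byte: one padding bit, a 3-bit patch type
# number and a 4-bit compression number.  For example, pack_header(0, 0)
# yields b'\x00'; the actual numbers come from the PATCH_TYPE_* constants
# and compression_string_to_number() in .common.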


def create_compressor(compression,
                      heatshrink_window_sz2,
                      heatshrink_lookahead_sz2):
    if compression == 'lzma':
        compressor = lzma.LZMACompressor(format=lzma.FORMAT_ALONE)
    elif compression == 'bz2':
        compressor = BZ2Compressor()
    elif compression == 'none':
        compressor = NoneCompressor()
    elif compression == 'crle':
        compressor = CrleCompressor()
    elif compression == 'heatshrink':
        compressor = HeatshrinkCompressor(heatshrink_window_sz2,
                                          heatshrink_lookahead_sz2)
    elif compression == 'zstd':
        compressor = ZstdCompressor()
    elif compression == 'lz4':
        compressor = Lz4Compressor()
    else:
        raise Error(format_bad_compression_string(compression))

    return compressor
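
# Every compressor returned above exposes the same incremental
# compress()/flush() interface, which is how the rest of this module uses
# them.  Illustrative sketch (not part of the library):
#
#     compressor = create_compressor('lzma', 8, 7)
#     fpatch.write(compressor.compress(b'first chunk'))
#     fpatch.write(compressor.compress(b'second chunk'))
#     fpatch.write(compressor.flush())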


def create_suffix_array(suffix_array, data, suffix_array_algorithm):
    if suffix_array_algorithm == 'sais':
        sais(data, suffix_array)
    elif suffix_array_algorithm == 'divsufsort':
        divsufsort(data, suffix_array)
    else:
        raise Error('Bad suffix array algorithm {}.'.format(suffix_array_algorithm))


def temporary_file(size):
    fp = tempfile.TemporaryFile()
    fp.truncate(size)
    fp.flush()
    fp.seek(0)

    return fp


def mmap_read_only(fin):
    return mmap.mmap(fin.fileno(), 0, access=mmap.ACCESS_READ)


def mmap_read_write(fin):
    return mmap.mmap(fin.fileno(), 0)


def create_chunks_mmap(ffrom, fto, suffix_array_algorithm):
    LOGGER.debug('Creating chunks using mmap.')

    suffix_array_size = 4 * (file_size(ffrom) + 1)

    with mmap_read_only(ffrom) as from_mmap:
        with mmap_read_only(fto) as to_mmap:
            with temporary_file(suffix_array_size) as fsuffix_array:
                with mmap_read_write(fsuffix_array) as suffix_array_mmap:
                    start_time = time.time()
                    create_suffix_array(suffix_array_mmap,
                                        from_mmap,
                                        suffix_array_algorithm)

                    LOGGER.info('Suffix array of %s created in %s using mmap.',
                                format_size(suffix_array_size),
                                format_timespan(time.time() - start_time))

                    with temporary_file(file_size(fto) + 1) as fde:
                        with mmap_read_write(fde) as fde_mmap:
                            start_time = time.time()
                            chunks = bsdiff.create_patch(suffix_array_mmap,
                                                         from_mmap,
                                                         to_mmap,
                                                         fde_mmap)

                            LOGGER.info(
                                'Bsdiff algorithm completed in %s using mmap.',
                                format_timespan(time.time() - start_time))

    return chunks
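
# Keeping the suffix array (four bytes per entry) and the extra buffer in
# temporary files lets the operating system page them in and out on demand,
# instead of holding everything on the Python heap as the fallback below
# does.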


def create_chunks_heap(ffrom, fto, suffix_array_algorithm):
    LOGGER.debug('Creating chunks using the heap.')

    from_data = file_read(ffrom)
    start_time = time.time()
    suffix_array = bytearray(4 * (len(from_data) + 1))
    create_suffix_array(suffix_array, from_data, suffix_array_algorithm)

    LOGGER.info('Suffix array of %s created in %s.',
                format_size(len(suffix_array)),
                format_timespan(time.time() - start_time))

    start_time = time.time()
    chunks = bsdiff.create_patch(suffix_array,
                                 from_data,
                                 file_read(fto),
                                 bytearray(file_size(fto) + 1))

    LOGGER.info('Bsdiff algorithm completed in %s.',
                format_timespan(time.time() - start_time))

    return chunks


def create_chunks(ffrom, fto, suffix_array_algorithm, use_mmap):
    if not use_mmap:
        return create_chunks_heap(ffrom, fto, suffix_array_algorithm)

    try:
        return create_chunks_mmap(ffrom, fto, suffix_array_algorithm)
    except (io.UnsupportedOperation, ValueError):
        return create_chunks_heap(ffrom, fto, suffix_array_algorithm)
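
# The heap fallback covers file objects without a real file descriptor
# (io.BytesIO.fileno() raises io.UnsupportedOperation) and empty files,
# which mmap rejects with ValueError.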


def create_patch_sequential_data(ffrom,
                                 fto,
                                 fpatch,
                                 compression,
                                 suffix_array_algorithm,
                                 data_format,
                                 data_segment,
                                 use_mmap,
                                 heatshrink_window_sz2,
                                 heatshrink_lookahead_sz2):
    to_size = file_size(fto)

    if to_size == 0:
        return

    compressor = create_compressor(compression,
                                   heatshrink_window_sz2,
                                   heatshrink_lookahead_sz2)

    if data_format is None:
        dfpatch = pack_size(0)
    else:
        ffrom, fto, patch = data_format_encode(
            ffrom,
            fto,
            data_format,
            data_segment)

        dfpatch = pack_size(len(patch))
        dfpatch += pack_size(DATA_FORMATS[data_format])
        dfpatch += patch

    fpatch.write(compressor.compress(dfpatch))
    chunks = create_chunks(ffrom, fto, suffix_array_algorithm, use_mmap)
    start_time = time.time()

    for chunk in chunks:
        fpatch.write(compressor.compress(chunk))

    fpatch.write(compressor.flush())

    LOGGER.info('Compression (%s) completed in %s.',
                compression,
                format_timespan(time.time() - start_time))


def create_patch_sequential(ffrom,
                            fto,
                            fpatch,
                            compression,
                            suffix_array_algorithm,
                            data_format,
                            data_segment,
                            use_mmap,
                            heatshrink_window_sz2,
                            heatshrink_lookahead_sz2):
    fpatch.write(pack_header(PATCH_TYPE_SEQUENTIAL,
                             compression_string_to_number(compression)))
    fpatch.write(pack_size(file_size(fto)))
    create_patch_sequential_data(ffrom,
                                 fto,
                                 fpatch,
                                 compression,
                                 suffix_array_algorithm,
                                 data_format,
                                 data_segment,
                                 use_mmap,
                                 heatshrink_window_sz2,
                                 heatshrink_lookahead_sz2)
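
# On disk a sequential patch is therefore: one header byte (patch type and
# compression), the to-file size packed by pack_size(), and a single
# compressed stream holding the data-format patch followed by the bsdiff
# chunks.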


def calc_shift(memory_size, segment_size, minimum_shift_size, from_size):
    """Shift from data as many segments as possible.

    """

    memory_segments = div_ceil(memory_size, segment_size)
    from_segments = div_ceil(from_size, segment_size)

    shift_segments = (memory_segments - from_segments)
    shift_size = (shift_segments * segment_size)

    if shift_size < minimum_shift_size:
        shift_size = minimum_shift_size

    return shift_size
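
# Worked example with illustrative numbers: segment_size = 64 KiB,
# memory_size = 512 KiB (8 segments) and a 6 segment from-file give a shift
# of 2 segments (128 KiB).  A 7 segment from-file would only allow 1
# segment, so the default minimum_shift_size of 2 * segment_size (see
# create_patch_in_place() below) lifts the result to 128 KiB anyway.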


def create_patch_in_place(ffrom,
                          fto,
                          fpatch,
                          compression,
                          suffix_array_algorithm,
                          memory_size,
                          segment_size,
                          minimum_shift_size,
                          data_format,
                          data_segment,
                          use_mmap,
                          heatshrink_window_sz2,
                          heatshrink_lookahead_sz2):
    if (memory_size % segment_size) != 0:
        raise Error(
            'Memory size {} is not a multiple of segment size {}.'.format(
                memory_size,
                segment_size))

    if minimum_shift_size is None:
        minimum_shift_size = 2 * segment_size

    if (minimum_shift_size % segment_size) != 0:
        raise Error(
            'Minimum shift size {} is not a multiple of segment size {}.'.format(
                minimum_shift_size,
                segment_size))

    from_data = ffrom.read()
    from_size = len(from_data)
    to_data = fto.read()
    to_size = len(to_data)
    shift_size = calc_shift(memory_size,
                            segment_size,
                            minimum_shift_size,
                            len(from_data))
    shifted_size = (memory_size - shift_size)
    from_data = from_data[:shifted_size]
    number_of_to_segments = div_ceil(to_size, segment_size)

    # Create a sequential patch for each segment.
    fsegments = BytesIO()

    for segment in range(number_of_to_segments):
        to_offset = (segment * segment_size)
        from_offset = max(to_offset + segment_size - shift_size, 0)
        fsegment = BytesIO()
        create_patch_sequential_data(
            BytesIO(from_data[from_offset:]),
            BytesIO(to_data[to_offset:to_offset + segment_size]),
            fsegment,
            'none',
            suffix_array_algorithm,
            data_format,
            data_segment,
            use_mmap,
            heatshrink_window_sz2,
            heatshrink_lookahead_sz2)
        fsegments.write(fsegment.getvalue())

    # Create the patch.
    fpatch.write(pack_header(PATCH_TYPE_IN_PLACE,
                             compression_string_to_number(compression)))
    fpatch.write(pack_size(memory_size))
    fpatch.write(pack_size(segment_size))
    fpatch.write(pack_size(shift_size))
    fpatch.write(pack_size(from_size))
    fpatch.write(pack_size(to_size))

    if to_size == 0:
        return

    compressor = create_compressor(compression,
                                   heatshrink_window_sz2,
                                   heatshrink_lookahead_sz2)
    fpatch.write(compressor.compress(fsegments.getvalue()))
    fpatch.write(compressor.flush())


def offtout(x):
    if x < 0:
        x *= -1
        x |= (1 << 63)

    return struct.pack('<Q', x)
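
# bsdiff stores offsets as 64-bit little-endian sign-and-magnitude values:
# negative numbers are written as their magnitude with the most significant
# bit set, so offtout(-5) == struct.pack('<Q', 5 | (1 << 63)).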


def create_patch_bsdiff(ffrom, fto, fpatch):
    to_size = file_size(fto)
    from_data = file_read(ffrom)
    start_time = time.time()
    suffix_array = bytearray(4 * (len(from_data) + 1))
    divsufsort(from_data, suffix_array)
    chunks = bsdiff.create_patch(suffix_array,
                                 from_data,
                                 file_read(fto),
                                 bytearray(file_size(fto) + 1))

    LOGGER.info('Bsdiff algorithm completed in %s.',
                format_timespan(time.time() - start_time))

    fctrl = BytesIO()
    fdiff = BytesIO()
    fextra = BytesIO()

    start_time = time.time()

    ctrl_compressor = BZ2Compressor()
    diff_compressor = BZ2Compressor()
    extra_compressor = BZ2Compressor()

    for i in range(0, len(chunks), 5):
        size = offtout(unpack_size_bytes(chunks[i + 0]))
        fctrl.write(ctrl_compressor.compress(size))
        fdiff.write(diff_compressor.compress(chunks[i + 1]))
        size = offtout(unpack_size_bytes(chunks[i + 2]))
        fctrl.write(ctrl_compressor.compress(size))
        fextra.write(extra_compressor.compress(chunks[i + 3]))
        size = offtout(unpack_size_bytes(chunks[i + 4]))
        fctrl.write(ctrl_compressor.compress(size))

    fctrl.write(ctrl_compressor.flush())
    fdiff.write(diff_compressor.flush())
    fextra.write(extra_compressor.flush())

    LOGGER.info('Compression completed in %s.',
                format_timespan(time.time() - start_time))

    # Write everything to the patch file.
    fpatch.write(b'BSDIFF40')
    fpatch.write(offtout(fctrl.tell()))
    fpatch.write(offtout(fdiff.tell()))
    fpatch.write(offtout(to_size))
    fpatch.write(fctrl.getvalue())
    fpatch.write(fdiff.getvalue())
    fpatch.write(fextra.getvalue())
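
# The output follows the classic BSDIFF40 layout: the 8 byte magic, the
# lengths of the bzip2-compressed control and diff blocks, the size of the
# new file, and then the three bzip2 streams (control, diff, extra) in that
# order.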


def create_patch_hdiffpatch_generic(ffrom,
                                    fto,
                                    match_score,
                                    match_block_size,
                                    patch_type,
                                    use_mmap):
    if use_mmap:
        with mmap_read_only(ffrom) as from_mmap:
            with mmap_read_only(fto) as to_mmap:
                return hdiffpatch.create_patch(from_mmap,
                                               to_mmap,
                                               match_score,
                                               match_block_size,
                                               patch_type)
    else:
        return hdiffpatch.create_patch(file_read(ffrom),
                                       file_read(fto),
                                       match_score,
                                       match_block_size,
                                       patch_type)


def create_patch_hdiffpatch(ffrom,
                            fto,
                            fpatch,
                            compression,
                            match_score,
                            use_mmap,
                            heatshrink_window_sz2,
                            heatshrink_lookahead_sz2):
    start_time = time.time()
    patch = create_patch_hdiffpatch_generic(ffrom,
                                            fto,
                                            match_score,
                                            0,
                                            PATCH_TYPE_HDIFFPATCH,
                                            use_mmap)

    LOGGER.info('Hdiffpatch algorithm completed in %s.',
                format_timespan(time.time() - start_time))

    start_time = time.time()
    compressor = create_compressor(compression,
                                   heatshrink_window_sz2,
                                   heatshrink_lookahead_sz2)

    fpatch.write(pack_header(PATCH_TYPE_HDIFFPATCH,
                             compression_string_to_number(compression)))
    fpatch.write(pack_size(file_size(fto)))
    fpatch.write(pack_size(len(patch)))
    fpatch.write(compressor.compress(patch))
    fpatch.write(compressor.flush())

    LOGGER.info('Compression completed in %s.',
                format_timespan(time.time() - start_time))


def create_patch_match_blocks(ffrom,
                              fto,
                              fpatch,
                              compression,
                              patch_type,
                              match_block_size,
                              use_mmap,
                              heatshrink_window_sz2,
                              heatshrink_lookahead_sz2):
    start_time = time.time()
    patch = create_patch_hdiffpatch_generic(ffrom,
                                            fto,
                                            0,
                                            match_block_size,
                                            PATCH_TYPES[patch_type],
                                            use_mmap)

    LOGGER.info('Match blocks algorithm completed in %s.',
                format_timespan(time.time() - start_time))

    start_time = time.time()
    compressor = create_compressor(compression,
                                   heatshrink_window_sz2,
                                   heatshrink_lookahead_sz2)

    if patch_type == 'hdiffpatch':
        fpatch.write(pack_header(PATCH_TYPE_HDIFFPATCH,
                                 compression_string_to_number(compression)))
        fpatch.write(pack_size(file_size(fto)))
        fpatch.write(pack_size(len(patch)))
    elif patch_type == 'sequential':
        fpatch.write(pack_header(PATCH_TYPE_SEQUENTIAL,
                                 compression_string_to_number(compression)))
        fpatch.write(pack_size(file_size(fto)))
        fpatch.write(compressor.compress(pack_size(0)))
    else:
        raise Error('Bad patch type {}.'.format(patch_type))

    fpatch.write(compressor.compress(patch))
    fpatch.write(compressor.flush())

    LOGGER.info('Compression completed in %s.',
                format_timespan(time.time() - start_time))


def create_patch(ffrom,
                 fto,
                 fpatch,
                 compression='lzma',
                 patch_type='sequential',
                 algorithm='bsdiff',
                 suffix_array_algorithm='divsufsort',
                 memory_size=None,
                 segment_size=None,
                 minimum_shift_size=None,
                 data_format=None,
                 from_data_offset_begin=0,
                 from_data_offset_end=0,
                 from_data_begin=0,
                 from_data_end=0,
                 from_code_begin=0,
                 from_code_end=0,
                 to_data_offset_begin=0,
                 to_data_offset_end=0,
                 to_data_begin=0,
                 to_data_end=0,
                 to_code_begin=0,
                 to_code_end=0,
                 match_score=6,
                 match_block_size=64,
                 use_mmap=True,
                 heatshrink_window_sz2=8,
                 heatshrink_lookahead_sz2=7):
    """Create a patch from `ffrom` to `fto` and write it to `fpatch`. All
    three arguments are file-like objects.

    `compression` must be ``'bz2'``, ``'crle'``, ``'lzma'``, ``'zstd'``,
    ``'lz4'`` or ``'none'``.

    `patch_type` must be ``'sequential'``, ``'in-place'``, ``'hdiffpatch'``
    or ``'bsdiff'``.

    `algorithm` must be ``'bsdiff'``, ``'hdiffpatch'`` or ``'match-blocks'``.

    `suffix_array_algorithm` must be ``'sais'`` or ``'divsufsort'``.

    `memory_size`, `segment_size` and `minimum_shift_size` are used when
    creating an in-place patch.

    `match_score` is used by the hdiffpatch algorithm. Default 6.
    Recommended 0-4 for binary files and 4-9 for text files.

    `match_block_size` is used by the match-blocks algorithm. Default 64.
    Less memory is needed to create the patch, but the patch will be bigger.

    >>> ffrom = open('foo.old', 'rb')
    >>> fto = open('foo.new', 'rb')
    >>> fpatch = open('foo.patch', 'wb')
    >>> create_patch(ffrom, fto, fpatch)

    """

    data_segment = DataSegment(from_data_offset_begin,
                               from_data_offset_end,
                               from_data_begin,
                               from_data_end,
                               from_code_begin,
                               from_code_end,
                               to_data_offset_begin,
                               to_data_offset_end,
                               to_data_begin,
                               to_data_end,
                               to_code_begin,
                               to_code_end)

    if algorithm == 'bsdiff' and patch_type == 'sequential':
        create_patch_sequential(ffrom,
                                fto,
                                fpatch,
                                compression,
                                suffix_array_algorithm,
                                data_format,
                                data_segment,
                                use_mmap,
                                heatshrink_window_sz2,
                                heatshrink_lookahead_sz2)
    elif algorithm == 'bsdiff' and patch_type == 'in-place':
        create_patch_in_place(ffrom,
                              fto,
                              fpatch,
                              compression,
                              suffix_array_algorithm,
                              memory_size,
                              segment_size,
                              minimum_shift_size,
                              data_format,
                              data_segment,
                              use_mmap,
                              heatshrink_window_sz2,
                              heatshrink_lookahead_sz2)
    elif algorithm == 'bsdiff' and patch_type == 'bsdiff':
        create_patch_bsdiff(ffrom, fto, fpatch)
    elif algorithm == 'hdiffpatch' and patch_type == 'hdiffpatch':
        create_patch_hdiffpatch(ffrom,
                                fto,
                                fpatch,
                                compression,
                                match_score,
                                use_mmap,
                                heatshrink_window_sz2,
                                heatshrink_lookahead_sz2)
    elif algorithm == 'match-blocks':
        create_patch_match_blocks(ffrom,
                                  fto,
                                  fpatch,
                                  compression,
                                  patch_type,
                                  match_block_size,
                                  use_mmap,
                                  heatshrink_window_sz2,
                                  heatshrink_lookahead_sz2)
    else:
        raise Error(
            "Bad algorithm ({}) and patch type ({}) combination.".format(
                algorithm,
                patch_type))
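

# Illustrative use of the in-place patch type; the memory and segment sizes
# below are made-up example values, not recommendations:
#
# >>> with open('foo.old', 'rb') as ffrom, \
# ...      open('foo.new', 'rb') as fto, \
# ...      open('foo.patch', 'wb') as fpatch:
# ...     create_patch(ffrom, fto, fpatch,
# ...                  patch_type='in-place',
# ...                  memory_size=0x20000,
# ...                  segment_size=0x8000)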


def create_patch_filenames(fromfile,
                           tofile,
                           patchfile,
                           compression='lzma',
                           patch_type='sequential',
                           algorithm='bsdiff',
                           suffix_array_algorithm='divsufsort',
                           memory_size=None,
                           segment_size=None,
                           minimum_shift_size=None,
                           data_format=None,
                           from_data_offset_begin=0,
                           from_data_offset_end=0,
                           from_data_begin=0,
                           from_data_end=0,
                           from_code_begin=0,
                           from_code_end=0,
                           to_data_offset_begin=0,
                           to_data_offset_end=0,
                           to_data_begin=0,
                           to_data_end=0,
                           to_code_begin=0,
                           to_code_end=0,
                           match_score=6,
                           match_block_size=64,
                           use_mmap=True,
                           heatshrink_window_sz2=8,
                           heatshrink_lookahead_sz2=7):
    """Same as :func:`~detools.create_patch()`, but with filenames instead
    of file-like objects.

    >>> create_patch_filenames('foo.old', 'foo.new', 'foo.patch')

    """

    with open(fromfile, 'rb') as ffrom:
        with open(tofile, 'rb') as fto:
            with open(patchfile, 'wb') as fpatch:
                create_patch(ffrom,
                             fto,
                             fpatch,
                             compression,
                             patch_type,
                             algorithm,
                             suffix_array_algorithm,
                             memory_size,
                             segment_size,
                             minimum_shift_size,
                             data_format,
                             from_data_offset_begin,
                             from_data_offset_end,
                             from_data_begin,
                             from_data_end,
                             from_code_begin,
                             from_code_end,
                             to_data_offset_begin,
                             to_data_offset_end,
                             to_data_begin,
                             to_data_end,
                             to_code_begin,
                             to_code_end,
                             match_score,
                             match_block_size,
                             use_mmap,
                             heatshrink_window_sz2,
                             heatshrink_lookahead_sz2)