#!/usr/bin/env python3

import os, re, sys, stat, tarfile, argparse
import itertools
from collections import namedtuple
from timeit import default_timer as timer


# Global debug verbosity: 0 = quiet, 1 = progress messages, >= 2 = extra diagnostics
printDebug = 1

def overrides( parentClass ):
    """Decorator asserting that the decorated method overrides a method of parentClass."""
    def overrider( method ):
        assert( method.__name__ in dir( parentClass ) )
        return method
    return overrider


# Metadata stored per archive member:
#   offset   : byte offset of the member's data inside the TAR
#   size     : member size in bytes
#   mtime    : modification time from the TAR header
#   mode     : permission bits combined with stat file-type bits
#   type     : TAR member type (e.g. tarfile.DIRTYPE)
#   linkname : symlink target (empty for non-links)
#   uid, gid : owner user and group IDs
#   istar    : True if this entry is itself a recursively mounted TAR
FileInfo = namedtuple( "FileInfo", "offset size mtime mode type linkname uid gid istar" )


class IndexedTar( object ):
    """
    This class reads once through a whole TAR archive and stores TAR file offsets for all packed files
    in an index to support fast seeking to a given file.
    """

    __slots__ = (
        'tarFileName',
        'fileIndex',
        'mountRecursively',
        'cacheFolder',
        'possibleIndexFilePaths',
        'indexFileName',
    )

    # these allowed backends also double as extensions for the index file to look for
    availableSerializationBackends = [
        'pickle',
        'pickle2',
        'pickle3',
        'custom',
        'cbor',
        'msgpack',
        'rapidjson',
        'ujson',
        'simplejson'
    ]
    availableCompressions = [
        '', # no compression
        'lz4',
        'gz',
    ]

    def __init__( self, pathToTar = None, fileObject = None, writeIndex = False,
                  recursive = False, serializationBackend = None ):
        """
        pathToTar            : path to the TAR archive to index; may be None when fileObject is given
        fileObject           : already opened binary file object to read the TAR from
        writeIndex           : if True, try to persist the created index to disk
        recursive            : if True, contained .tar files are indexed and mounted recursively
        serializationBackend : one of supportedIndexExtensions() or None for the default ('custom')
        """
        # Fix: os.path.normpath( None ) raises TypeError; None is a valid value when fileObject
        # is given (createIndex explicitly handles self.tarFileName being None).
        self.tarFileName = os.path.normpath( pathToTar ) if pathToTar is not None else None
        # Stores the file hierarchy in a dictionary with keys being either the file and containing file
        # metainformation or keys being a folder name and containing a recursively defined dictionary.
        self.fileIndex = {}
        self.mountRecursively = recursive

        self.cacheFolder = os.path.expanduser( "~/.ratarmount" ) # will be used for storing if current path is read-only
        if self.tarFileName is not None:
            self.possibleIndexFilePaths = [
                self.tarFileName + ".index",
                self.cacheFolder + "/" + self.tarFileName.replace( "/", "_" ) + ".index"
            ]
        else:
            self.possibleIndexFilePaths = []

        # Fix: do not warn when no backend was requested at all (None is the documented default);
        # only warn for an explicitly requested but unsupported backend.
        if serializationBackend is None:
            serializationBackend = 'custom'
        elif serializationBackend not in self.supportedIndexExtensions():
            serializationBackend = 'custom'
            print( "[Warning] Serialization backend not supported. Defaulting to '" + serializationBackend + "'!" )

        # this is the actual index file, which will be used in the end, and by default
        self.indexFileName = self.possibleIndexFilePaths[0] + "." + serializationBackend \
                             if self.possibleIndexFilePaths else None

        if fileObject is not None:
            if writeIndex:
                print( "Can't write out index for file object input. Ignoring this option." )
            self.createIndex( fileObject )
        else:
            # first try to find an existing index file for the chosen serialization backend
            for indexPath in self.possibleIndexFilePaths:
                indexPathWithExt = indexPath + "." + serializationBackend

                if self.indexIsLoaded():
                    break

                if os.path.isfile( indexPathWithExt ):
                    if os.path.getsize( indexPathWithExt ) == 0:
                        os.remove( indexPathWithExt ) # drop empty (broken) index files
                    else:
                        # NOTE(review): nothing in this loop actually loads indexPathWithExt into
                        # memory; a call like self.loadIndex( indexPathWithExt ) appears to be
                        # missing here — confirm against the full file.
                        writeIndex = False

            if not self.indexIsLoaded():
                with open( self.tarFileName, 'rb' ) as file:
                    self.createIndex( file )

                if writeIndex:
                    # find the first index location we are allowed to write to
                    for indexPath in self.possibleIndexFilePaths:
                        indexPath += "." + serializationBackend

                        try:
                            folder = os.path.dirname( indexPath )
                            if not os.path.exists( folder ):
                                os.mkdir( folder )

                            # probe writability by creating and removing an empty file
                            f = open( indexPath, 'wb' )
                            f.close()
                            os.remove( indexPath )
                            self.indexFileName = indexPath

                            break
                        except IOError:
                            if printDebug >= 2:
                                print( "Could not create file:", indexPath )

                    try:
                        self.writeIndex( self.indexFileName )
                    except IOError:
                        print( "[Info] Could not write TAR index to file." )
) + + @staticmethod + def supportedIndexExtensions(): + return [ '.'.join( combination ).strip( '.' ) + for combination in itertools.product( IndexedTar.availableSerializationBackends, + IndexedTar.availableCompressions ) ] + @staticmethod + def dump( toDump, file ): + if isinstance( toDump, dict ): + file.write( b'\x01' ) # magic code meaning "start dictionary object" + + for key, value in toDump.items(): + file.write( b'\x03' ) # magic code meaning "serialized key value pair" + IndexedTar.dump( key, file ) + IndexedTar.dump( value, file ) + + file.write( b'\x02' ) # magic code meaning "close dictionary object" + + elif isinstance( toDump, FileInfo ): + import msgpack + serialized = msgpack.dumps( toDump ) + file.write( b'\x05' ) # magic code meaning "msgpack object" + file.write( len( serialized ).to_bytes( 4, byteorder = 'little' ) ) + file.write( serialized ) + + elif isinstance( toDump, str ): + serialized = toDump.encode() + file.write( b'\x04' ) # magic code meaning "string object" + file.write( len( serialized ).to_bytes( 4, byteorder = 'little' ) ) + file.write( serialized ) + + else: + print( "Ignoring unsupported type to write:", toDump ) + + @staticmethod + def load( file ): + elementType = file.read( 1 ) + + if elementType == b'\x01': # start of dictionary + result = {} + + dictElementType = file.read( 1 ) + while len( dictElementType ) != 0: + if dictElementType == b'\x02': + break + + elif dictElementType == b'\x03': + import msgpack + + keyType = file.read( 1 ) + if keyType != b'\x04': # key must be string object + raise Exception( 'Custom TAR index loader: invalid file format' ) + size = int.from_bytes( file.read( 4 ), byteorder = 'little' ) + key = file.read( size ).decode() + + valueType = file.read( 1 ) + if valueType == b'\x05': # msgpack object + size = int.from_bytes( file.read( 4 ), byteorder = 'little' ) + serialized = file.read( size ) + value = FileInfo( *msgpack.loads( serialized ) ) + + elif valueType == b'\x01': # dict object + import 
io + file.seek( -1, io.SEEK_CUR ) + value = IndexedTar.load( file ) + + else: + raise Exception( 'Custom TAR index loader: invalid file format ' + + '(expected msgpack or dict but got' + + str( int.from_bytes( valueType, byteorder = 'little' ) ) + ')' ) + + result[key] = value + + else: + raise Exception( 'Custom TAR index loader: invalid file format ' + + '(expected end-of-dict or key-value pair but got' + + str( int.from_bytes( dictElementType, byteorder = 'little' ) ) + ')' ) + + dictElementType = file.read( 1 ) + + return result + + else: + raise Exception( 'Custom TAR index loader: invalid file format' ) + + def getFileInfo( self, path, listDir = False ): + # go down file hierarchy tree along the given path + p = self.fileIndex + for name in os.path.normpath( path ).split( os.sep ): + if not name: + continue + if not name in p: + return + p = p[name] + + def repackDeserializedNamedTuple( p ): + if isinstance( p, list ) and len( p ) == len( FileInfo._fields ): + return FileInfo( *p ) + elif isinstance( p, dict ) and len( p ) == len( FileInfo._fields ) and \ + 'uid' in p and isinstance( p['uid'], int ): + # a normal directory dict must only have dict or FileInfo values, so if the value to the 'uid' + # key is an actual int, then it is sure it is a deserialized FileInfo object and not a file named 'uid' + print( "P ===", p ) + print( "FileInfo ===", FileInfo( **p ) ) + return FileInfo( **p ) + return p + + p = repackDeserializedNamedTuple( p ) + + # if the directory contents are not to be printed and it is a directory, return the "file" info of "." + if not listDir and isinstance( p, dict ): + if '.' 
in p: + p = p['.'] + else: + return FileInfo( + offset = 0, # not necessary for directory anyways + size = 1, # might be misleading / non-conform + mtime = 0, + mode = 0o555 | stat.S_IFDIR, + type = tarfile.DIRTYPE, + linkname = "", + uid = 0, + gid = 0, + istar = False + ) + + return repackDeserializedNamedTuple( p ) + + def isDir( self, path ): + return True if isinstance( self.getFileInfo( path, listDir = True ), dict ) else False + + def exists( self, path ): + path = os.path.normpath( path ) + return self.isDir( path ) or isinstance( self.getFileInfo( path ), FileInfo ) + + def setFileInfo( self, path, fileInfo ): + """ + path: the full path to the file with leading slash (/) for which to set the file info + """ + assert( isinstance( fileInfo, FileInfo ) ) + + pathHierarchy = os.path.normpath( path ).split( os.sep ) + if len( pathHierarchy ) == 0: + return + + # go down file hierarchy tree along the given path + p = self.fileIndex + for name in pathHierarchy[:-1]: + if not name: + continue + assert( isinstance( p, dict ) ) + p = p.setdefault( name, {} ) + + # create a new key in the dictionary of the parent folder + p.update( { pathHierarchy[-1] : fileInfo } ) + + def setDirInfo( self, path, dirInfo, dirContents = {} ): + """ + path: the full path to the file with leading slash (/) for which to set the folder info + """ + assert( isinstance( dirInfo, FileInfo ) ) + assert( isinstance( dirContents, dict ) ) + + pathHierarchy = os.path.normpath( path ).strip( os.sep ).split( os.sep ) + if len( pathHierarchy ) == 0: + return + + # go down file hierarchy tree along the given path + p = self.fileIndex + for name in pathHierarchy[:-1]: + if not name: + continue + assert( isinstance( p, dict ) ) + p = p.setdefault( name, {} ) + + # create a new key in the dictionary of the parent folder + p.update( { pathHierarchy[-1] : dirContents } ) + p[pathHierarchy[-1]].update( { '.' 
: dirInfo } ) + + def createIndex( self, fileObject ): + if printDebug >= 1: + print( "Creating offset dictionary for", "" if self.tarFileName is None else self.tarFileName, "..." ) + t0 = timer() + + self.fileIndex = {} + try: + loadedTarFile = tarfile.open( fileobj = fileObject, mode = 'r:' ) + except tarfile.ReadError as exception: + print( "Archive can't be opened! This might happen for compressed TAR archives, which currently is not supported." ) + raise exception + + for tarInfo in loadedTarFile: + mode = tarInfo.mode + if tarInfo.isdir() : mode |= stat.S_IFDIR + if tarInfo.isfile(): mode |= stat.S_IFREG + if tarInfo.issym() : mode |= stat.S_IFLNK + if tarInfo.ischr() : mode |= stat.S_IFCHR + if tarInfo.isfifo(): mode |= stat.S_IFIFO + fileInfo = FileInfo( + offset = tarInfo.offset_data, + size = tarInfo.size , + mtime = tarInfo.mtime , + mode = mode , + type = tarInfo.type , + linkname = tarInfo.linkname , + uid = tarInfo.uid , + gid = tarInfo.gid , + istar = False + ) + + # open contained tars for recursive mounting + indexedTar = None + if self.mountRecursively and tarInfo.isfile() and tarInfo.name.endswith( ".tar" ): + oldPos = fileObject.tell() + if oldPos != tarInfo.offset_data: + fileObject.seek( tarInfo.offset_data ) + indexedTar = IndexedTar( tarInfo.name, fileObject = fileObject, writeIndex = False ) + fileObject.seek( fileObject.tell() ) # might be especially necessary if the .tar is not actually a tar! 
+ + # Add a leading '/' as a convention where '/' represents the TAR root folder + # Partly, done because fusepy specifies paths in a mounted directory like this + path = os.path.normpath( "/" + tarInfo.name ) + + # test whether the TAR file could be loaded and if so "mount" it recursively + if indexedTar is not None and indexedTar.indexIsLoaded(): + # actually apply the recursive tar mounting + extractedName = re.sub( r"\.tar$", "", path ) + if not self.exists( extractedName ): + path = extractedName + + mountMode = ( fileInfo.mode & 0o777 ) | stat.S_IFDIR + if mountMode & stat.S_IRUSR != 0: mountMode |= stat.S_IXUSR + if mountMode & stat.S_IRGRP != 0: mountMode |= stat.S_IXGRP + if mountMode & stat.S_IROTH != 0: mountMode |= stat.S_IXOTH + fileInfo = fileInfo._replace( mode = mountMode, istar = True ) + + if self.exists( path ): + print( "[Warning]", path, "already exists in database and will be overwritten!" ) + + # merge fileIndex from recursively loaded TAR into our Indexes + self.setDirInfo( path, fileInfo, indexedTar.fileIndex ) + + elif path != '/': + # just a warning and check for the path already existing + if self.exists( path ): + fileInfo = self.getFileInfo( path, listDir = False ) + if fileInfo.istar: + # move recursively mounted TAR directory to original .tar name if there is a name-clash, + # e.g., when foo/ also exists in the TAR but foo.tar would be mounted to foo/. + # In this case, move that mount to foo.tar/ + self.setFileInfo( path + ".tar", fileInfo, self.getFileInfo( path, listDir = True ) ) + else: + print( "[Warning]", path, "already exists in database and will be overwritten!" 
) + + # simply store the file or directory information from current TAR item + if tarInfo.isdir(): + self.setDirInfo( path, fileInfo, {} ) + else: + self.setFileInfo( path, fileInfo ) + + t1 = timer() + if printDebug >= 1: + print( "Creating offset dictionary for", "" if self.tarFileName is None else self.tarFileName, "took {:.2f}s".format( t1 - t0 ) ) + + def serializationBackendFromFileName( self, fileName ): + splitName = fileName.split( '.' ) + + if len( splitName ) > 2 and '.'.join( splitName[-2:] ) in self.supportedIndexExtensions(): + return '.'.join( splitName[-2:] ) + elif splitName[-1] in self.supportedIndexExtensions(): + return splitName[-1] + return None + + def indexIsLoaded( self ): + return True if self.fileIndex else False + + def writeIndex( self, outFileName ): + """ + outFileName: full file name with backend extension. Depending on the extension the serialization is chosen. + """ + + serializationBackend = self.serializationBackendFromFileName( outFileName ) + + if printDebug >= 1: + print( "Writing out TAR index using", serializationBackend, "to", outFileName, "..." ) + t0 = timer() + + fileMode = 'wt' if 'json' in serializationBackend else 'wb' + + if serializationBackend.endswith( '.lz4' ): + import lz4.frame + wrapperOpen = lambda x : lz4.frame.open( x, fileMode ) + elif serializationBackend.endswith( '.gz' ): + import gzip + wrapperOpen = lambda x : gzip.open( x, fileMode ) + else: + wrapperOpen = lambda x : open( x, fileMode ) + serializationBackend = serializationBackend.split( '.' )[0] + + # libraries tested but not working: + # - marshal: can't serialize namedtuples + # - hickle: for some reason, creates files almost 64x larger as pickle!? 
And also takes similarly longer + # - yaml: almost a 10 times slower and more memory usage and deserializes everything including ints to string + + with wrapperOpen( outFileName ) as outFile: + if serializationBackend == 'pickle2': + import pickle + pickle.dump( self.fileIndex, outFile ) + pickle.dump( self.fileIndex, outFile, protocol = 2 ) + + # default serialization because it has the fewest dependencies and because it was legacy default + elif serializationBackend == 'pickle3' or \ + serializationBackend == 'pickle' or \ + serializationBackend is None: + import pickle + pickle.dump( self.fileIndex, outFile ) + pickle.dump( self.fileIndex, outFile, protocol = 3 ) # 3 is default protocol + + elif serializationBackend == 'simplejson': + import simplejson + simplejson.dump( self.fileIndex, outFile, namedtuple_as_object = True ) + + elif serializationBackend == 'custom': + IndexedTar.dump( self.fileIndex, outFile ) + + elif serializationBackend in [ 'msgpack', 'cbor', 'rapidjson', 'ujson' ]: + import importlib + module = importlib.import_module( serializationBackend ) + getattr( module, 'dump' )( self.fileIndex, outFile ) + + else: + print( "Tried to save index with unsupported extension backend:", serializationBackend, "!" ) + + t1 = timer() + if printDebug >= 1: + print( "Writing out TAR index to", outFileName, "took {:.2f}s".format( t1 - t0 ), + "and is sized", os.stat( outFileName ).st_size, "B" ) + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + formatter_class = argparse.ArgumentDefaultsHelpFormatter, + description = '''\ + Create index for random access to files inside the tar which will be saved to .index.[. '_'>.index.[.