ucftools/matlab/@ustar/ustar.m

478 lines
17 KiB
Matlab

classdef ustar < handle
    % USTAR  Low-level utilities for UNIX standard tar (ustar) files.
    %
    %   The class opens a tar archive read-only, builds an in-memory index
    %   (name, data offset, data size) of the contained subfiles -- either by
    %   scanning the archive headers or by loading an external index file --
    %   and offers lookup, extraction and index export based on that index.
    %
    %   Typical usage:
    %       t = ustar();
    %       t.open('archive.tar');
    %       [names,sizes] = t.list();
    %       t.extract(names{1},'outfile','copy.bin');
    %       t.close();
    properties (Access = public)
        File              % tar file name
        IndexFile         % index file name
        IOMode            % I/O mode of the opened file (this class only sets 'read')
        NumberOfSubfiles  % number of subfiles
    end
    properties (Access = private)
        % File info
        fileID = -1       % identifier returned by fopen; negative while no file is open
        ioflag            % permission string passed to fopen
        subFile           % cell array (column) of subfile names
        subFileBeg        % byte offset of each subfile's data within the archive
        subFileSize       % size in bytes of each subfile's data
        % Current subfile information (filled by readHeader)
        currentFile
        currentMode
        currentUID
        currentGID
        currentFileSize
        currentModtime
        currentLink
        currentLinkname
        currentUsername
        currentGroupname
        currentDevMajor
        currentDevMinor
        currentFileBeg
        % Constants
        scanBuffSize = 2^17;    % number of index entries preallocated by the scanner
        extrBuffSize = 4194304; % buffer size of extracter (bytes per chunk)
        blockSize    = 512;     % ustar block size (do not change)
    end
    %% ------------------------------------------------------------------------%%
    %% CONSTRUCTORS/DESTRUCTORS                                                %%
    %% ------------------------------------------------------------------------%%
    methods(Access=public)
        function obj = ustar()
            % obj = ustar()
            % Default constructor. Creates an object with no file attached.
            obj.resetPublicProperties();
            obj.resetPrivateProperties();
            obj.resetCurrent();
        end
        function delete(obj)
            % obj.delete()
            % Default destructor. Closes the archive if one is open.
            obj.close();
        end
    end
    %% ------------------------------------------------------------------------%%
    %% INITIALIZATION METHODS                                                  %%
    %% ------------------------------------------------------------------------%%
    methods(Access=public)
        function open(obj,file)
            % obj.open(file)
            % Opens a tar file in read-only mode and scans all of its headers
            % to build the subfile index.
            % Input
            %   file    path to TAR file
            % Raises an error if the file cannot be opened or if a header
            % fails validation during the scan.
            obj.openTarFile(file);
            obj.scanArchive();
            obj.resetCurrent();
        end
        function openIndexed(obj,tarfile,indexfile)
            % obj.openIndexed(tarfile,indexfile)
            % Opens a tar file in read-only mode while using available
            % indexing data instead of scanning the archive.
            % Input
            %   tarfile     path to TAR file
            %   indexfile   path to index file; the format is selected by its
            %               extension ('.simplejson', '.msgpack' or '.taridx')
            obj.openTarFile(tarfile);
            % Assigned after openTarFile, which resets the public properties.
            obj.IndexFile = indexfile;
            obj.scanIndexFile();
            obj.resetCurrent();
        end
    end
    %% ------------------------------------------------------------------------%%
    %% PUBLIC METHODS                                                          %%
    %% ------------------------------------------------------------------------%%
    methods(Access=public)
        function close(obj)
            % obj.close()
            % Closes the archive and clears all cached information.
            % Does nothing if no file is open. If fclose reports a failure,
            % a warning is issued and the object state is left untouched.
            if isempty(obj.fileID) || obj.fileID<0
                return;
            end
            status = fclose(obj.fileID);
            if status<0
                warning('Unable to close file (exit code: %d)',status);
                return;
            end
            obj.resetPublicProperties();
            obj.resetPrivateProperties();
            obj.resetCurrent();
            obj.fileID = -1;
        end
        function [ptr] = pointer(obj,fname)
            % [ptr] = obj.pointer(fname)
            % Returns a 'pointer' to the requested file within the tar-ball
            % which can be used to read the data without extracting.
            % Input
            %   fname   file name of subfile within tar-ball
            % Output
            %   ptr     pointer: [fid,first byte,number of bytes]
            idx = obj.findSubfile(fname);
            ptr = [obj.fileID,obj.subFileBeg(idx),obj.subFileSize(idx)];
        end
        function [fname,fsize] = list(obj)
            % [fname,fsize] = obj.list()
            % Returns a list of name/size of all subfiles within the tar-ball
            % Output
            %   fname   cell array with filenames
            %   fsize   array with file sizes in bytes
            fname = obj.subFile;
            fsize = obj.subFileSize;
        end
        function extract(obj,fname,varargin)
            % obj.extract(fname)
            % obj.extract(fname,'outfile',outfile)
            % Extracts the requested subfile to a standalone file.
            % Input
            %   fname       name of subfile
            %   ? outfile   path of output file (default: fname)
            % Raises an error if the subfile is unknown, if the output file
            % cannot be created, or if the archive ends before all bytes of
            % the subfile have been read.
            par = inputParser;
            addParamValue(par,'outfile',fname,@ischar);
            parse(par,varargin{:});
            outfile = par.Results.outfile;
            idx   = obj.findSubfile(fname);
            fbeg  = obj.subFileBeg(idx);
            fsize = obj.subFileSize(idx);
            fidw = fopen(outfile,'w');
            if fidw<0
                error('Unable to open file: %s',outfile);
            end
            % Make sure the output file is closed even if an error occurs below.
            closeOutput = onCleanup(@() fclose(fidw)); %#ok<NASGU>
            if fseek(obj.fileID,fbeg,'bof')<0
                error('Unable to seek to subfile: %s',fname);
            end
            % Copy the data in chunks of at most extrBuffSize bytes
            remaining = fsize;
            while remaining>0
                nread = min(remaining,obj.extrBuffSize);
                buff  = fread(obj.fileID,[1,nread],'*uint8');
                if numel(buff)~=nread
                    error('Unexpected end of archive while extracting: %s',fname);
                end
                fwrite(fidw,buff);
                remaining = remaining-nread;
            end
        end
        function [flag] = isSubfile(obj,fname)
            % [flag] = obj.isSubfile(fname)
            % Checks if a subfile exists within tar-ball.
            % Input
            %   fname   name of subfile
            % Output
            %   flag    true if at least one index entry has this name
            flag = any(ismember(obj.subFile,fname));
        end
        function writeIndex(obj,outfile)
            % obj.writeIndex(outfile)
            % Write a index file for tar archive in custom '.taridx' format
            % The format is:
            %   nsubfile        int64
            %   [nsubfile times]
            %     subFileName   256*char
            %     subFileBeg    int64
            %     subFileSize   int64
            % Each name record is exactly 256 bytes: the name, a terminating
            % zero byte, then blank padding.
            % Input
            %   outfile     name of index file to be written (with extension '.taridx')
            nameLength = 256;
            fid = fopen(outfile,'wb');
            if fid<0
                error('Unable to open file: %s',outfile);
            end
            closeIndex = onCleanup(@() fclose(fid)); %#ok<NASGU>
            fwrite(fid,obj.NumberOfSubfiles,'int64');
            for ii=1:obj.NumberOfSubfiles
                nchar = length(obj.subFile{ii});
                % One byte is reserved for the terminating zero; a longer name
                % would silently grow the record and corrupt the fixed layout.
                if nchar>=nameLength
                    error('File name too long for index file: %s',obj.subFile{ii});
                end
                subfile = blanks(nameLength);
                subfile(1:nchar) = obj.subFile{ii};
                subfile(nchar+1) = 0;
                % Written as raw bytes so that every record occupies exactly
                % nameLength bytes regardless of the file's character encoding.
                % NOTE: character codes above 255 saturate in the uint8 conversion.
                fwrite(fid,uint8(subfile),'uint8');
                fwrite(fid,obj.subFileBeg(ii),'int64');
                fwrite(fid,obj.subFileSize(ii),'int64');
            end
        end
        function [fname,foffset,fsize,nfile] = dumpIndex(obj)
            % [fname,foffset,fsize,nfile] = obj.dumpIndex()
            % Get indexing data of tarfile
            % Output
            %   fname       cell array of file names
            %   foffset     data offset within tar file
            %   fsize       data size
            %   nfile       number of files in archive
            nfile   = obj.NumberOfSubfiles;
            fname   = obj.subFile;
            foffset = obj.subFileBeg;
            fsize   = obj.subFileSize;
        end
    end
    %% ------------------------------------------------------------------------%%
    %% PRIVATE METHODS                                                         %%
    %% ------------------------------------------------------------------------%%
    methods(Access=private)
        function openTarFile(obj,file)
            % obj.openTarFile(file)
            % Shared part of open/openIndexed: closes any archive that is
            % still open on this object (so its handle is not leaked) and
            % opens 'file' in read-only mode.
            obj.close();
            obj.File   = file;
            obj.IOMode = 'read';
            obj.ioflag = 'r';
            obj.fileID = fopen(obj.File,obj.ioflag);
            if obj.fileID<0
                error('Unable to open file: %s',obj.File);
            end
        end
        function scanArchive(obj)
            % obj.scanArchive()
            % Scans the tar-ball for subfiles and stores meta-data in class variables.
            obj.subFile     = cell(obj.scanBuffSize,1);
            obj.subFileBeg  = zeros(obj.scanBuffSize,1);
            obj.subFileSize = zeros(obj.scanBuffSize,1);
            % Jump to start of file
            fseek(obj.fileID,0,'bof');
            % Loop over (unknown) number of subfiles and evaluate header
            ii = 0;
            while ~obj.checkEOF()
                ii = ii+1;
                obj.readHeader(true);
                obj.subFile{ii}     = obj.currentFile;
                obj.subFileSize(ii) = obj.currentFileSize;
                obj.subFileBeg(ii)  = obj.currentFileBeg;
                % Data is stored in whole blocks; skip to the next header.
                nblock = ceil(obj.currentFileSize/obj.blockSize);
                if fseek(obj.fileID,nblock*obj.blockSize,'cof')<0
                    % The data of this entry extends beyond the end of the
                    % file; nothing can follow, so stop scanning here.
                    warning('Archive appears to be truncated within subfile: %s',obj.currentFile);
                    break;
                end
            end
            % Truncate preallocated arrays
            obj.NumberOfSubfiles = ii;
            obj.subFile     = obj.subFile(1:ii);
            obj.subFileSize = obj.subFileSize(1:ii);
            obj.subFileBeg  = obj.subFileBeg(1:ii);
            if obj.NumberOfSubfiles>obj.scanBuffSize
                warning('Number of subfiles exceeds scanBuffSize.');
            end
            obj.resetCurrent();
        end
        function scanIndexFile(obj)
            % obj.scanIndexFile()
            % Reads tar meta-data from index file into class variables.
            % The decoder is chosen from the extension of obj.IndexFile.
            [~,~,fileExtension] = fileparts(obj.IndexFile);
            switch fileExtension
                case '.simplejson'
                    % Open and read file contents (ASCII)
                    indexfileID = fopen(obj.IndexFile,'r');
                    if indexfileID<0
                        error('Unable to open file: %s',obj.IndexFile);
                    end
                    fseek(indexfileID,0,'bof');
                    jsonstr = fread(indexfileID,'schar=>char')';
                    fclose(indexfileID);
                    % Parse JSON and reconstruct filenames
                    if ~isempty(which('jsonlab.loadjson'))
                        % User function from matlab central
                        % This function is preferred, since filenames can be
                        % reconstructed safely from parsed JSON (. <=> _0x2E_)
                        json = jsonlab.loadjson(jsonstr);
                        jsonFields = fieldnames(json);
                        tarFileName = strrep(jsonFields,'_0x2E_','.');
                    elseif ~isempty(which('jsondecode'))
                        % Built-in function
                        % Second choice only, since filename might be ambiguous
                        % if it has no extension, but contains underscore. (. => _)
                        json = jsondecode(jsonstr);
                        jsonFields = fieldnames(json);
                        idxtmp = strfind(jsonFields,'_');
                        tarFileName = jsonFields;
                        for ifield=1:length(jsonFields)
                            if ~isempty(idxtmp{ifield})
                                % Only the last underscore is turned back into a dot.
                                tarFileName{ifield}(idxtmp{ifield}(end)) = '.';
                            end
                        end
                    else % no JSON decoder available
                        error('No JSON decoder available.');
                    end
                    % Extract important fields (preallocated so that an empty
                    % index still defines both arrays)
                    nsubfile = length(jsonFields);
                    tarFileOffset = zeros(1,nsubfile);
                    tarFileSize   = zeros(1,nsubfile);
                    for isub=1:nsubfile
                        tarFileOffset(isub) = json.(jsonFields{isub}).offset;
                        tarFileSize(isub)   = json.(jsonFields{isub}).size;
                    end
                case '.msgpack'
                    % Open and read file contents (binary)
                    indexfileID = fopen(obj.IndexFile,'rb');
                    if indexfileID<0
                        error('Unable to open file: %s',obj.IndexFile);
                    end
                    fseek(indexfileID,0,'bof');
                    msgbytes = fread(indexfileID,'uint8=>uint8');
                    fclose(indexfileID);
                    % Parse msgpack
                    if ~isempty(which('msgpack.parsemsgpack'))
                        msg = msgpack.parsemsgpack(msgbytes);
                        tarFileName = msg.keys;
                        nsubfile = length(tarFileName);
                        tarFileSize   = zeros(1,nsubfile);
                        tarFileOffset = zeros(1,nsubfile);
                        for isub=1:nsubfile
                            % Each value is indexed as {offset,size}.
                            tmp = msg(tarFileName{isub});
                            tarFileOffset(isub) = double(tmp{1});
                            tarFileSize(isub)   = double(tmp{2});
                        end
                    else % no msgpack decoder available
                        error('No msgpack decoder available.');
                    end
                case '.taridx'
                    % Open and read file contents (binary)
                    indexfileID = fopen(obj.IndexFile,'rb');
                    if indexfileID<0
                        error('Unable to open file: %s',obj.IndexFile);
                    end
                    % Close the index file on every exit path of this function.
                    closeIndex = onCleanup(@() fclose(indexfileID)); %#ok<NASGU>
                    fseek(indexfileID,0,'bof');
                    nsubfile = fread(indexfileID,1,'int64=>double');
                    if isempty(nsubfile)
                        error('Unable to read number of subfiles from: %s',obj.IndexFile);
                    end
                    tarFileName   = cell(1,nsubfile);
                    tarFileSize   = zeros(1,nsubfile);
                    tarFileOffset = zeros(1,nsubfile);
                    for isub=1:nsubfile
                        % deblank strips the blank/zero padding of the 256-byte record.
                        tarFileName{isub}   = deblank(fread(indexfileID,[1,256],'uint8=>char'));
                        tarFileOffset(isub) = fread(indexfileID,1,'int64=>double');
                        tarFileSize(isub)   = fread(indexfileID,1,'int64=>double');
                    end
                otherwise
                    error('Unknown file extension of index file: %s',fileExtension);
            end
            % Order by offset, i.e. file order within tarball and assign
            % to class variables (always as column vectors)
            [~,idxsort] = sort(tarFileOffset);
            obj.subFile     = reshape(tarFileName(idxsort),[],1);
            obj.subFileBeg  = reshape(tarFileOffset(idxsort),[],1);
            obj.subFileSize = reshape(tarFileSize(idxsort),[],1);
            obj.NumberOfSubfiles = nsubfile;
        end
        function readHeader(obj,scanMode)
            % obj.readHeader(scanMode)
            % Reads header data of a subfile in tar-ball and stores information
            % in 'current*' class-variables. On return the file position is at
            % the first data byte of the subfile.
            % Input
            %   scanMode    when set to true, omit parts which are not needed during scan
            % Raises an error on a short read, a checksum mismatch, a missing
            % 'ustar' magic or an unparsable size field.
            %
            % Bytes are read unsigned so that values above 127 are preserved
            % and the checksum is the plain sum of byte values.
            header = fread(obj.fileID,[1,obj.blockSize],'uint8=>char');
            if numel(header)<obj.blockSize
                error('Unexpected end of file while reading tar header.');
            end
            % Extract header information (fixed field layout; the two
            % 'version' bytes at 264:265 are not used)
            name     = header(1:100);
            mode     = header(101:108);
            uid      = header(109:116);
            gid      = header(117:124);
            fsize    = header(125:136);
            mtime    = header(137:148);
            chksum   = header(149:156);
            link     = header(157);
            linkname = header(158:257);
            magic    = header(258:263);
            uname    = header(266:297);
            gname    = header(298:329);
            devmajor = header(330:337);
            devminor = header(338:345);
            prefix   = header(346:500);
            % Evaluate checksum
            chksum1 = ustar.computeChecksum(header);
            chksum2 = ustar.parseOctalStr(chksum);
            if chksum1~=chksum2
                error('Checksum mismatch! %d,%d',chksum1,chksum2);
            end
            % Evaluate magic
            if isempty(strfind(ustar.parseStr(magic),'ustar'))
                error(' Not a UNIX standard tar file.')
            end
            % Parse header information
            % A non-empty prefix and the name are joined by a slash to form
            % the full path.
            nameStr   = ustar.parseStr(name);
            prefixStr = ustar.parseStr(prefix);
            if isempty(prefixStr)
                obj.currentFile = nameStr;
            else
                obj.currentFile = [prefixStr,'/',nameStr];
            end
            obj.currentFileBeg  = ftell(obj.fileID);
            obj.currentFileSize = ustar.parseOctalStr(fsize);
            if isnan(obj.currentFileSize)
                error('Unable to parse file size in header of: %s',obj.currentFile);
            end
            if ~scanMode
                obj.currentMode      = ustar.parseStr(mode);
                obj.currentUID       = ustar.parseOctalStr(uid);
                obj.currentGID       = ustar.parseOctalStr(gid);
                % mtime is converted from seconds since 1970-01-01 to a date string
                obj.currentModtime   = datestr(ustar.parseOctalStr(mtime)/86400+datenum(1970,1,1));
                obj.currentLink      = ustar.parseOctalStr(link);
                obj.currentLinkname  = ustar.parseStr(linkname);
                obj.currentUsername  = ustar.parseStr(uname);
                obj.currentGroupname = ustar.parseStr(gname);
                obj.currentDevMajor  = ustar.parseOctalStr(devmajor);
                obj.currentDevMinor  = ustar.parseOctalStr(devminor);
            end
        end
        function [isEOF] = checkEOF(obj)
            % [isEOF] = obj.checkEOF()
            % Checks if end-of-file is reached. This is the case if
            %   - the physical end of the file is hit while reading a block, or
            %   - a block of binary zeros is followed by a second zero block
            %     or by the physical end of the file.
            % Unless the zero-block terminator is detected, the file position
            % is restored to where it was on entry.
            % Output
            %   isEOF   flag which indicates end-of-file
            isEOF = false;
            curPosition = ftell(obj.fileID);
            blockref = zeros(1,obj.blockSize,'int8');
            blockcur = fread(obj.fileID,[1,obj.blockSize],'int8=>int8');
            if feof(obj.fileID)
                isEOF = true;
            end
            if isequal(blockcur,blockref)
                blockcur = fread(obj.fileID,[1,obj.blockSize],'int8=>int8');
                % A short read here means the archive ends after a single
                % zero block; there is no further header to read.
                if isequal(blockcur,blockref) || numel(blockcur)<obj.blockSize
                    isEOF = true;
                    return;
                end
            end
            fseek(obj.fileID,curPosition,'bof');
        end
        function [idx] = findSubfile(obj,fname)
            % [idx] = obj.findSubfile(fname)
            % Get index of requested subfile
            % Input
            %   fname   name of subfile
            % Output
            %   idx     index of subfile (scalar). If the name occurs more
            %           than once, a warning is issued and the last matching
            %           entry is returned.
            % Raises an error if no entry matches.
            isReqFile = ismember(obj.subFile,fname);
            nmatch = sum(isReqFile);
            if nmatch==0
                error('File not found: %s',fname);
            end
            if nmatch>1
                warning('More than one matching file found.');
            end
            % Always scalar, so callers can use it for seeking/concatenation.
            idx = find(isReqFile,1,'last');
        end
        function resetPublicProperties(obj)
            % Clears all public properties.
            obj.File             = [];
            obj.IndexFile        = [];
            obj.IOMode           = [];
            obj.NumberOfSubfiles = [];
        end
        function resetPrivateProperties(obj)
            % Clears the cached subfile index (the file identifier is
            % managed by openTarFile/close).
            obj.ioflag      = [];
            obj.subFile     = [];
            obj.subFileBeg  = [];
            obj.subFileSize = [];
        end
        function resetCurrent(obj)
            % Clears the information on the most recently read header.
            obj.currentFile      = [];
            obj.currentMode      = [];
            obj.currentUID       = [];
            obj.currentGID       = [];
            obj.currentFileSize  = [];
            obj.currentModtime   = [];
            obj.currentLink      = [];
            obj.currentLinkname  = [];
            obj.currentUsername  = [];
            obj.currentGroupname = [];
            obj.currentDevMajor  = [];
            obj.currentDevMinor  = [];
            obj.currentFileBeg   = [];
        end
    end
    %% ------------------------------------------------------------------------%%
    %% PRIVATE STATIC METHODS                                                  %%
    %% ------------------------------------------------------------------------%%
    methods(Access=private,Static)
        function [chksum] = computeChecksum(block)
            % Sum of all character codes of a header block, with the
            % checksum field itself (149:156) replaced by spaces.
            block(149:156) = ' '; % checksum is computed with spaces in check sum field
            chksum = sum(double(block));
        end
        function [str] = parseStr(str)
            % Removes all zero characters from a header field.
            charZero = cast(0,'char');
            str = strrep(str,charZero,'');
        end
        function [num] = parseOctalStr(str)
            % Converts an octal number stored as text in a header field to
            % its numeric value. Returns NaN if the field is empty or holds
            % anything but the digits 0-7 (after removing zero characters
            % and surrounding whitespace).
            txt = strtrim(ustar.parseStr(str));
            if isempty(txt) || any(txt<'0' | txt>'7')
                num = NaN;
                return;
            end
            num = ustar.oct2dec_long(str2double(txt));
        end
        function [dec] = oct2dec_long(oct)
            % Interprets the decimal digits of 'oct' as an octal number and
            % returns its value. Returns NaN for non-finite or negative
            % input, for which the digit loop below would never terminate.
            if ~isfinite(oct) || oct<0
                dec = NaN;
                return;
            end
            dec = 0;
            ii  = 1;
            while floor(oct/10^(ii-1))~=0
                cbase = 8^(ii-1);
                cfact = floor(mod(oct,10^ii)/10^(ii-1)); % ii-th digit from the right
                dec   = dec + cfact*cbase;
                ii    = ii+1;
            end
        end
    end
end