当前位置: 动力学知识库 > 问答 > 编程问答 >

compression - why python tarfile gz is not reducing filesize

问题描述:

So, I am trying to compress 3 text files each of 10MB to one file as tar.gz, but it doesn't seem to reduce the final tar.gz. Final tar.gz filesize is still 30MB.

Can anyone please tell me why this is happening? I have the highest level of compression

>>> import os

>>> import sys

>>> import tarfile

>>> import tempfile

>>> size_in_mb = 10

>>>

>>> def compress_str_to_tar(tmppath, files_str, tarfileprefix, tarmode="w:gz"):

...     ''' compress string contents in files and tar. finally creates a tar file in tmppath

...     @param tmppath: (str) pathdirectory where temp files to be compressed will be created

...     @param files_str: (dict) {filename: filecontent_in_str} these will be compressed

...     @param tarfileprefix: (str) output filename (without suffix) of tar

...     @param tarmode: (str) w:gz or w:bz2

...     '''

...     tar = tarfile.open(os.path.join(tmppath, tarfileprefix+'.tar.'+tarmode.split(':')[1]), tarmode, compresslevel=9)

...     for filename in files_str:

...         with open(os.path.join(tmppath, filename), 'wb') as tmpf:

...             tmpf.write(files_str[filename])

...         tar.add(os.path.join(tmppath, filename), arcname=filename)

...     tar.close()

...

...

>>> mail_size = 0

>>> files_str = {}

>>> for i in range(3):

...     d = os.urandom(1*size_in_mb*(10**6))

...     files_str['attachment'+str(i)+'.txt'] = d

...     mail_size += sys.getsizeof(d)

...

...

>>> print('mail_size', float(mail_size)/10**6)

('mail_size', 30.000111)

>>>

>>> tmppath = tempfile.mkdtemp()

>>> print('tar-tmppath', tmppath)

('tar-tmppath', '/tmp/tmpndifyt')

>>> tarfileprefix = 'tmpfoobar'

>>> compress_str_to_tar(tmppath, files_str, tarfileprefix, 'w:gz')

>>> print('mail_size', float(sys.getsizeof(open(os.path.join(tmppath, tarfileprefix+'.tar.gz')).read()))/10**6)

('mail_size', 30.009782)

>>>

>>>

>>>

网友答案:

You're trying to compress data generated by os.urandom, which is random.

Random data compresses very badly if the random function is good.

Compression works by identifying repeating patterns. The better the random algorithm is, the fewer repeating patterns there are to find.

I recommend that you try with real files, or with random text generated from a given list of words (not random letters); you'll get much better compression.

网友答案:

So, as @Jean stated, I was able to compress 3 files of 10MB with same repeated character to 0.02MB => d = ('1'*size_in_mb*10**6)

import os
import sys
import tarfile
import tempfile
size_in_mb = 10

def compress_str_to_tar(tmppath, files_str, tarfileprefix, tarmode="w:gz"):
    ''' Write each entry of files_str to a file in tmppath and bundle them into one tar.

    The archive is created at <tmppath>/<tarfileprefix>.tar.<suffix>, where
    <suffix> is the part of tarmode after the colon (e.g. "gz" for "w:gz").
    The intermediate per-entry files are left in tmppath after the call.

    @param tmppath: (str) directory where the temp files and the archive are created
    @param files_str: (dict) {filename: filecontent}; content may be bytes or text
                      (text is encoded as UTF-8 before being written)
    @param tarfileprefix: (str) output filename (without suffix) of tar
    @param tarmode: (str) w:gz or w:bz2
    '''
    archive_path = os.path.join(tmppath, tarfileprefix + '.tar.' + tarmode.split(':')[1])
    # The with-block closes the archive handle even if writing or adding a
    # member raises, instead of leaking it.
    with tarfile.open(archive_path, tarmode, compresslevel=9) as tar:
        for filename, content in files_str.items():
            # type(u'') is the text type on both Python 2 and 3; the member
            # file is opened in binary mode, which rejects text on Python 3.
            if isinstance(content, type(u'')):
                content = content.encode('utf-8')
            member_path = os.path.join(tmppath, filename)
            with open(member_path, 'wb') as tmpf:
                tmpf.write(content)
            tar.add(member_path, arcname=filename)


# Build three 10 MB payloads made of one repeated character, archive them with
# gzip, and print the uncompressed total and the archive size (both in MB).
mail_size = 0
files_str = {}
for i in range(3):
    # Bytes literal so the payload can be written to a file opened in binary
    # mode on Python 3 as well as Python 2.
    d = b'1' * (size_in_mb * 10**6)
    files_str['attachment'+str(i)+'.txt'] = d
    # len() counts payload bytes; sys.getsizeof() would also count the
    # Python object header and overstate the data size.
    mail_size += len(d)


print('mail_size', float(mail_size)/10**6)

tmppath = tempfile.mkdtemp()
print('tar-tmppath', tmppath)
tarfileprefix = 'tmpfoobar'
compress_str_to_tar(tmppath, files_str, tarfileprefix, 'w:gz')
# Ask the filesystem for the archive size instead of reading the whole binary
# file through a text-mode handle that is never closed.
print('mail_size', float(os.path.getsize(os.path.join(tmppath, tarfileprefix+'.tar.gz')))/10**6)
>>> print('mail_size', float(mail_size)/10**6)
('mail_size', 30.000111)
>>>
>>> tmppath = tempfile.mkdtemp()
>>> print('tar-tmppath', tmppath)
('tar-tmppath', '/tmp/tmpA3r51N')
>>> tarfileprefix = 'tmpfoobar'
>>> compress_str_to_tar(tmppath, files_str, tarfileprefix, 'w:gz')
>>> print('mail_size', float(sys.getsizeof(open(os.path.join(tmppath, tarfileprefix+'.tar.gz')).read()))/10**6)
('mail_size', 0.02958)
>>>
>>>
>>>
分享给朋友:
您可能感兴趣的文章:
随机阅读: