I have read several posts including this one. but none helped.
Here is the python code that I have currently that splits the file
my input file size is 15G and I am splitting it into 128MB. my computer has 8G memory
import sys
def read_line(f_object,terminal_byte):
line = ''.join(iter(lambda:f_object.read(1),terminal_byte))
line+="x01"
return line
def read_lines(f_object,terminal_byte):
tmp = read_line(f_object,terminal_byte)
while tmp:
yield tmp
tmp = read_line(f_object,terminal_byte)
def make_chunks(f_object,terminal_byte,max_size):
current_chunk = []
current_chunk_size = 0
for line in read_lines(f_object,terminal_byte):
current_chunk.append(line)
current_chunk_size += len(line)
if current_chunk_size > max_size:
yield "".join(current_chunk)
current_chunk = []
current_chunk_size = 0
if current_chunk:
yield ''.join(current_chunk)
inputfile=sys.argv[1]
with open(inputfile,"rb") as f_in:
for i,chunk in enumerate(make_chunks(f_in, bytes(chr(1)),1024*1000*128)):
with open("out%d.txt"%i,"wb") as f_out:
f_out.write(chunk)
when I execute the script, I get the following error:
Traceback (most recent call last):
File "splitter.py", line 30, in <module>
for i,chunk in enumerate(make_chunks(f_in, bytes(chr(1)),1024*1000*128)):
File "splitter.py", line 17, in make_chunks
for line in read_lines(f_object,terminal_byte):
File "splitter.py", line 12, in read_lines
tmp = read_line(f_object,terminal_byte)
File "splitter.py", line 4, in read_line
line = ''.join(iter(lambda:f_object.read(1),terminal_byte))
MemoryError
See Question&Answers more detail:
os 与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…