I would like to know whether parallel file writing is efficient. A hard disk has a single usable read/write head, so an HDD can only service one request at a time. But the tests below (in Python) contradict what I expected.
The file to copy is around 1 GB.
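(For anyone who wants to reproduce this without my data, here is a rough sketch that generates a file of similar size; the line length and line count are arbitrary, chosen only to reach roughly 1 GB:)

#!/usr/bin/env python
# rough sketch to generate a ~1 GB test file; the path matches the
# scripts below, but the line content, length, and count are arbitrary
with open("/env/cns/bigtmp1/ERR000916_2.fastq", "w") as f:
    line = "A" * 79 + "\n"                 # 80 bytes per line
    for _ in range(13 * 1000 * 1000):      # ~13M lines, roughly 1.04 GB
        f.write(line)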
Script 1 (parallel: read and write the same file line by line, 10 times):
#!/usr/bin/env python
from multiprocessing import Pool

def read_and_write(copy_filename):
    # read the source line by line and write each line to a new copy
    with open("/env/cns/bigtmp1/ERR000916_2.fastq", "r") as fori:
        with open("/env/cns/bigtmp1/{}.fastq".format(copy_filename), "w") as fout:
            for line in fori:
                fout.write(line)  # each line already ends with "\n"
    return copy_filename

def main():
    f_names = ["test_jm_{}".format(i) for i in range(0, 10)]
    pool = Pool(processes=4)  # 4 workers handle the 10 copies in parallel
    results = pool.map(read_and_write, f_names)

if __name__ == "__main__":
    main()
Script 2 (serial: read and write the same file line by line, 10 times):
#!/usr/bin/env python

def read_and_write(copy_filename):
    # same line-by-line copy as script 1, but called sequentially
    with open("/env/cns/bigtmp1/ERR000916_2.fastq", "r") as fori:
        with open("/env/cns/bigtmp1/{}.fastq".format(copy_filename), "w") as fout:
            for line in fori:
                fout.write(line)  # each line already ends with "\n"
    return copy_filename

def main():
    f_names = ["test_jm_{}".format(i) for i in range(0, 10)]
    for n in f_names:
        result = read_and_write(n)

if __name__ == "__main__":
    main()
Script 3 (parallel: copy the same file 10 times):
#!/usr/bin/env python
from shutil import copyfile
from multiprocessing import Pool

def read_and_write(copy_filename):
    copyfile("/env/cns/bigtmp1/ERR000916_2.fastq",
             "/env/cns/bigtmp1/{}.fastq".format(copy_filename))
    return copy_filename

def main():
    f_names = ["test_jm_{}".format(i) for i in range(0, 10)]
    pool = Pool(processes=4)
    results = pool.map(read_and_write, f_names)

if __name__ == "__main__":
    main()
Script 4 (serial: copy the same file 10 times):
#!/usr/bin/env python
from shutil import copyfile

def read_and_write(copy_filename):
    copyfile("/env/cns/bigtmp1/ERR000916_2.fastq",
             "/env/cns/bigtmp1/{}.fastq".format(copy_filename))
    return copy_filename

def main():
    f_names = ["test_jm_{}".format(i) for i in range(0, 10)]
    for n in f_names:
        result = read_and_write(n)

if __name__ == "__main__":
    main()
Results:
$ # parallel: read and write the same file line by line, 10 times
$ time python read_write_1.py
real 1m46.484s
user 3m40.865s
sys 0m29.455s
$ rm test_jm*
$ # serial: read and write the same file line by line, 10 times
$ time python read_write_2.py
real 4m16.530s
user 3m41.303s
sys 0m24.032s
$ rm test_jm*
$ # parallel: copy the same file 10 times
$ time python read_write_3.py
real 1m35.890s
user 0m10.615s
sys 0m36.361s
$ rm test_jm*
$ # serial: copy the same file 10 times
$ time python read_write_4.py
real 1m40.660s
user 0m7.322s
sys 0m25.020s
$ rm test_jm*
These basic results seem to show that parallel read/write I/O is more efficient.
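Side note: one variant I have not benchmarked would be to copy in large buffered chunks instead of line by line, which should reduce the per-line Python overhead that shows up as high user time in scripts 1 and 2. A minimal sketch, reusing the same paths and pool size as above (the 16 MiB buffer size is an arbitrary choice):

#!/usr/bin/env python
from multiprocessing import Pool
from shutil import copyfileobj

def copy_chunked(copy_filename):
    # copy in large buffered chunks rather than line by line
    with open("/env/cns/bigtmp1/ERR000916_2.fastq", "rb") as fori:
        with open("/env/cns/bigtmp1/{}.fastq".format(copy_filename), "wb") as fout:
            copyfileobj(fori, fout, 16 * 1024 * 1024)  # 16 MiB buffer
    return copy_filename

def main():
    f_names = ["test_jm_{}".format(i) for i in range(0, 10)]
    pool = Pool(processes=4)
    results = pool.map(copy_chunked, f_names)  # blocks until all 10 copies finish

if __name__ == "__main__":
    main()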
Thanks for your insights.