1717from __future__ import absolute_import
1818
1919from typing import Iterable , List , Tuple # pylint: disable=unused-import
20+ import itertools
2021import logging
2122
2223import apache_beam as beam
3132from gcp_variant_transforms .beam_io import vcfio
3233
3334
def _get_file_size(file_name):
  # type: (str) -> int
  """Returns the raw on-disk size of `file_name`, in bytes.

  Args:
    file_name: Path (or pattern) that must resolve to exactly one file.

  Returns:
    The file's size in bytes, as reported by the filesystem.

  Raises:
    IOError: If `file_name` does not match exactly one file.
  """
  match_result = filesystems.FileSystems.match([file_name])[0]
  if len(match_result.metadata_list) != 1:
    raise IOError("File name {} did not correspond to exactly 1 result. "
                  "Instead, got {}.".format(file_name,
                                            len(match_result.metadata_list)))
  file_metadata = match_result.metadata_list[0]

  compression_type = filesystem.CompressionTypes.detect_compression_type(
      file_metadata.path)
  if compression_type != filesystem.CompressionTypes.UNCOMPRESSED:
    # A compressed file's on-disk size understates the decoded content, so a
    # disk estimate derived from it will be too low; warn but proceed.
    logging.error("VCF file %s is compressed; disk requirement estimator "
                  "will not be accurate.", file_metadata.path)
  return file_metadata.size_in_bytes
4650
4751
4852def _convert_variants_to_bytesize (variant ):
@@ -64,7 +68,7 @@ def estimate_encoded_file_size(self, raw_sample_size, encoded_sample_size):
6468 Given the raw_file_size and measurements of several VCF lines from the file,
6569 estimate how much disk the file will take after expansion due to encoding
6670 lines as `vcfio.Variant` objects. The encoded_sample_size will be set as
67- `self.encoded `.
71+ `self.encoded_size `.
6872
6973 This is a simple ratio problem, solving for encoded_sample_size which is
7074 the only unknown:
@@ -111,8 +115,11 @@ def extract_output(self, (raw, encoded)):
111115class _EstimateVcfSizeSource (filebasedsource .FileBasedSource ):
112116 """A source for estimating the encoded size of a VCF file in `vcf_to_bq`.
113117
114- This source first reads a limited number of variants from a set of VCF files,
115- then
118+ This source first obtains the raw file sizes of a set of VCF files. Then,
119+ the source reads a limited number of variants from a set of VCF files,
120+ both as raw strings and encoded `Variant` objects. Finally, the reader
121+ returns a single `FileSizeInfo` object with an estimate of the input size
122+ if all lines had been encoded as `Variant` objects.
116123
117124 Lines that are malformed are skipped.
118125
@@ -142,7 +149,7 @@ def read_records(
142149 file_name , # type: str
143150 range_tracker # type: range_trackers.UnsplittableRangeTracker
144151 ):
145- # type: (...) -> Iterable[Tuple[str, str, vcfio.Variant] ]
152+ # type: (...) -> Iterable[FileSizeInfo ]
146153 """This "generator" only emits a single FileSizeInfo object per file."""
147154 vcf_parser_class = vcfio .get_vcf_parser (self ._vcf_parser_type )
148155 record_iterator = vcf_parser_class (
# NOTE(review): the parser constructor's leading arguments (old lines 149-154
# / new 156-161) are elided by this hunk boundary; only the last two survive.
@@ -155,31 +162,34 @@ def read_records(
155162 buffer_size = self .DEFAULT_VCF_READ_BUFFER_SIZE ,
156163 skip_header_lines = 0 )
157164
158- _ , raw_file_size = _get_file_sizes (file_name )[ 0 ]
165+ _ , raw_file_size = _get_file_size (file_name )
# BUG(review): _get_file_size() now returns a bare int (size_in_bytes), not a
# (path, size) tuple, yet the new line keeps the 2-tuple unpack — this will
# raise TypeError at runtime. The new line should read:
#     raw_file_size = _get_file_size(file_name)
159166
160167 # Open distinct channel to read lines as raw bytestrings.
161168 with filesystems .FileSystems .open (file_name ,
162- self ._compression_type ) as raw_reader :
163- raw_record = raw_reader .readline ()
164- while raw_record and raw_record .startswith ('#' ):
165- # Skip headers, assume header size is negligible.
166- raw_record = raw_reader .readline ()
167-
169+ self ._compression_type ) as raw_iterator :
168170 count , raw_size , encoded_size = 0 , 0 , 0
169- for encoded_record in record_iterator :
171+ for encoded_record , raw_record in itertools .izip (record_iterator ,
172+ raw_iterator ):
# NOTE(review): itertools.izip is Python-2-only, consistent with this file's
# Py2-style `# type:` comments; a Py3 port would use the builtin zip().
173+ while raw_record and raw_record .startswith ('#' ):
174+ # Skip headers. Assume that header size is negligible.
175+ raw_record = raw_iterator .next ()
# NOTE(review): manually advancing raw_iterator here is meant to keep the raw
# line stream aligned with the parsed-record stream once the leading headers
# are consumed — presumably the parser skips headers itself; TODO confirm.
176+ logging .debug (
177+ "Reading record for disk usage estimation. Encoded variant: %s\n "
178+ "Raw variant: %s" , encoded_record , raw_record )
170179 if count >= self ._sample_size :
171180 break
172181 if not isinstance (encoded_record , vcfio .Variant ):
173182 logging .error (
174183 "Skipping VCF line that could not be decoded as a "
175184 "`vcfio.Variant` in file %s: %s" , file_name , raw_record )
176185 continue
177-
178- raw_size += len (raw_record )
186+ # Encoding in `utf-8` should represent the string as one byte per char,
187+ # even for non-ASCII chars. Python adds significant overhead to the
188+ # bytesize of the full str object.
189+ raw_size += len (raw_record .encode ('utf-8' ))
# NOTE(review): on Py2 a line read from FileSystems.open is already a byte
# str; calling .encode('utf-8') on it first decodes it as ASCII and would
# raise UnicodeDecodeError on non-ASCII bytes — verify input encoding.
179190 encoded_size += _convert_variants_to_bytesize (encoded_record )
180191 count += 1
181192
182- raw_record = raw_reader .readline () # Increment raw iterator.
183193 file_size_info = FileSizeInfo (file_name , raw_file_size )
184194 file_size_info .estimate_encoded_file_size (raw_size , encoded_size )
185195 yield file_size_info
0 commit comments