diff --git a/.dockstore.yml b/.dockstore.yml
index ce89e992252..62fca1abc19 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -183,6 +183,7 @@ workflows:
- master
- ah_var_store
- rsa_vs_1245
+ - hatcher_testing_parquet
tags:
- /.*/
- name: GvsPrepareRangesCallset
diff --git a/Dockerfile b/Dockerfile
index a513863128d..22f319086ad 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-ARG BASE_DOCKER=broadinstitute/gatk:gatkbase-3.1.0
+ARG BASE_DOCKER=broadinstitute/gatk:gatkbase-3.2.0
# stage 1 for constructing the GATK zip
FROM ${BASE_DOCKER} AS gradleBuild
@@ -93,8 +93,6 @@ RUN conda env create -n gatk -f /gatk/gatkcondaenv.yml && \
echo "source activate gatk" >> /gatk/gatkenv.rc && \
echo "source /gatk/gatk-completion.sh" >> /gatk/gatkenv.rc && \
conda clean -afy && \
- find /opt/miniconda/ -follow -type f -name '*.a' -delete && \
- find /opt/miniconda/ -follow -type f -name '*.pyc' -delete && \
rm -rf /root/.cache/pip
CMD ["bash", "--init-file", "/gatk/gatkenv.rc"]
diff --git a/build.gradle b/build.gradle
index f6600e81595..587b661a978 100644
--- a/build.gradle
+++ b/build.gradle
@@ -3,7 +3,7 @@
buildscript {
repositories {
mavenCentral()
- }
+ }
}
plugins {
@@ -16,6 +16,7 @@ plugins {
id "com.github.johnrengelman.shadow" version "8.1.1" //used to build the shadow and sparkJars
id "com.github.ben-manes.versions" version "0.12.0" //used for identifying dependencies that need updating
id 'com.palantir.git-version' version '0.5.1' //version helper
+ id 'org.sonatype.gradle.plugins.scan' version '2.6.1' // scans for security vulnerabilities in our dependencies
}
@@ -56,20 +57,20 @@ repositories {
mavenLocal()
}
-final htsjdkVersion = System.getProperty('htsjdk.version','3.0.5')
-final picardVersion = System.getProperty('picard.version','3.1.0')
+final htsjdkVersion = System.getProperty('htsjdk.version','4.1.0')
+final picardVersion = System.getProperty('picard.version','3.1.1')
final barclayVersion = System.getProperty('barclay.version','5.0.0')
-final sparkVersion = System.getProperty('spark.version', '3.3.1')
-final hadoopVersion = System.getProperty('hadoop.version', '3.3.1')
-final disqVersion = System.getProperty('disq.version','0.3.6')
-final genomicsdbVersion = System.getProperty('genomicsdb.version','1.5.0')
-final bigQueryVersion = System.getProperty('bigQuery.version', '2.31.0')
-final bigQueryStorageVersion = System.getProperty('bigQueryStorage.version', '2.41.0')
-final guavaVersion = System.getProperty('guava.version', '32.1.2-jre')
+final sparkVersion = System.getProperty('spark.version', '3.5.0')
+final hadoopVersion = System.getProperty('hadoop.version', '3.3.6')
+final disqVersion = System.getProperty('disq.version','0.3.8')
+final genomicsdbVersion = System.getProperty('genomicsdb.version','1.5.1')
+final bigQueryVersion = System.getProperty('bigQuery.version', '2.35.0')
+final bigQueryStorageVersion = System.getProperty('bigQueryStorage.version', '2.47.0')
+final guavaVersion = System.getProperty('guava.version', '32.1.3-jre')
final log4j2Version = System.getProperty('log4j2Version', '2.17.1')
final testNGVersion = '7.0.0'
-final googleCloudNioDependency = 'com.google.cloud:google-cloud-nio:0.127.0'
+final googleCloudNioDependency = 'com.google.cloud:google-cloud-nio:0.127.8'
final baseJarName = 'gatk'
final secondaryBaseJarName = 'hellbender'
@@ -109,7 +110,7 @@ def resolveLargeResourceStubFiles(largeResourcesFolder, buildPrerequisitesMessag
} catch (IOException e) {
throw new GradleException(
"An IOException occurred while attempting to execute the command $gitLFSExecCommand."
- + " git-lfs is required to build GATK but may not be installed. $buildPrerequisitesMessage", e)
+ + " git-lfs is required to build GATK but may not be installed. $buildPrerequisitesMessage", e)
}
}
@@ -187,8 +188,8 @@ configurations.all {
}
tasks.withType(JavaCompile) {
- options.compilerArgs = ['-proc:none', '-Xlint:all', '-Werror', '-Xdiags:verbose']
- options.encoding = 'UTF-8'
+ options.compilerArgs = ['-proc:none', '-Xlint:all', '-Werror', '-Xdiags:verbose']
+ options.encoding = 'UTF-8'
}
sourceSets {
@@ -267,12 +268,12 @@ dependencies {
// are routed to log4j
implementation 'org.apache.logging.log4j:log4j-jcl:' + log4j2Version
- implementation 'org.apache.commons:commons-lang3:3.5'
- implementation 'org.apache.commons:commons-math3:3.5'
+ implementation 'org.apache.commons:commons-lang3:3.14.0'
+ implementation 'org.apache.commons:commons-math3:3.6.1'
implementation 'org.hipparchus:hipparchus-stat:2.0'
- implementation 'org.apache.commons:commons-collections4:4.1'
- implementation 'org.apache.commons:commons-vfs2:2.0'
- implementation 'org.apache.commons:commons-configuration2:2.4'
+ implementation 'org.apache.commons:commons-collections4:4.4'
+ implementation 'org.apache.commons:commons-vfs2:2.9.0'
+ implementation 'org.apache.commons:commons-configuration2:2.9.0'
constraints {
implementation('org.apache.commons:commons-text') {
version {
@@ -287,7 +288,7 @@ dependencies {
implementation 'commons-io:commons-io:2.5'
implementation 'org.reflections:reflections:0.9.10'
- implementation 'it.unimi.dsi:fastutil:7.0.6'
+ implementation 'it.unimi.dsi:fastutil:7.0.13'
implementation 'org.broadinstitute:hdf5-java-bindings:1.1.0-hdf5_2.11.0'
implementation 'org.broadinstitute:gatk-native-bindings:1.0.0'
@@ -297,8 +298,8 @@ dependencies {
exclude group: 'org.apache.commons'
}
- //there is no mllib_2.12.15:3.3.0, so stay use 2.12:3.3.0
- implementation ('org.apache.spark:spark-mllib_2.12:3.3.0') {
+ // TODO: migrate to mllib_2.12.15?
+ implementation ('org.apache.spark:spark-mllib_2.12:' + sparkVersion) {
// JUL is used by Google Dataflow as the backend logger, so exclude jul-to-slf4j to avoid a loop
exclude module: 'jul-to-slf4j'
exclude module: 'javax.servlet'
@@ -344,15 +345,22 @@ dependencies {
implementation 'org.broadinstitute:gatk-bwamem-jni:1.0.4'
implementation 'org.broadinstitute:gatk-fermilite-jni:1.2.0'
- implementation 'org.broadinstitute:http-nio:0.1.0-rc1'
+ implementation 'org.broadinstitute:http-nio:1.1.0'
// Required for COSMIC Funcotator data source:
- implementation 'org.xerial:sqlite-jdbc:3.36.0.3'
+ implementation 'org.xerial:sqlite-jdbc:3.44.1.0'
// natural sort
implementation('net.grey-panther:natural-comparator:1.1')
implementation('com.fasterxml.jackson.module:jackson-module-scala_2.12:2.9.8')
+ // parquet writing
+ implementation('org.apache.parquet:parquet-common:1.13.1')
+ implementation('org.apache.parquet:parquet-encoding:1.13.1')
+ implementation('org.apache.parquet:parquet-column:1.13.1')
+ implementation('org.apache.parquet:parquet-hadoop:1.13.1')
+ implementation 'org.apache.parquet:parquet-avro:1.13.1'
+
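(Editor's note: these five Apache Parquet artifacts, all pinned to 1.13.1, supply the schema classes (`MessageType`, `MessageTypeParser`), the column/encoding internals, and the Hadoop-backed `ParquetWriter` machinery used by the `CreateVariantIngestFiles` changes later in this diff; a minimal writer sketch is shown after the schema definitions below.)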
testUtilsImplementation sourceSets.main.output
testUtilsImplementation 'org.testng:testng:' + testNGVersion
testUtilsImplementation 'org.apache.hadoop:hadoop-minicluster:' + hadoopVersion
@@ -426,20 +434,20 @@ final runtimeAddOpens = [
'java.base/jdk.internal.module=ALL-UNNAMED',
'java.base/java.lang.module=ALL-UNNAMED',
'java.security.jgss/sun.security.krb5=ALL-UNNAMED'
- ]
+]
final testAddOpens = [
'java.prefs/java.util.prefs=ALL-UNNAMED' // required for jacoco tasks
]
run {
- // transform the list of runtime configuration --add-opens args into command line argument format
- final runtimeJVMArgs = runtimeAddOpens.stream()
- .flatMap(openSpec -> ['--add-opens', openSpec].stream())
- .toList()
- // add in any other required args
- runtimeJVMArgs.add('-Dio.netty.tryReflectionSetAccessible=true')
- jvmArgs = runtimeJVMArgs
+ // transform the list of runtime configuration --add-opens args into command line argument format
+ final runtimeJVMArgs = runtimeAddOpens.stream()
+ .flatMap(openSpec -> ['--add-opens', openSpec].stream())
+ .toList()
+ // add in any other required args
+ runtimeJVMArgs.add('-Dio.netty.tryReflectionSetAccessible=true')
+ jvmArgs = runtimeJVMArgs
}
test {
@@ -981,6 +989,18 @@ task gatkValidateGeneratedWdl(dependsOn: [gatkWDLGen, shadowJar]) {
}
}
+// scan-gradle-plugin security vulnerability scan
+ossIndexAudit {
+    allConfigurations = false // if true, includes the dependencies in all resolvable configurations; by default only 'compileClasspath', 'runtimeClasspath', 'releaseCompileClasspath' and 'releaseRuntimeClasspath' are considered
+    useCache = true // true by default
+    outputFormat = 'DEFAULT' // optional; other values are 'DEPENDENCY_GRAPH' (prints a dependency graph showing direct/transitive dependencies) and 'JSON_CYCLONE_DX_1_4' (prints a CycloneDX 1.4 SBOM in JSON format)
+    showAll = false // if true, prints all dependencies; by default only dependencies with vulnerabilities are printed
+    printBanner = true // if true, prints an ASCII text banner; true by default
+
+ // ossIndexAudit can be configured to exclude vulnerabilities from matching
+ // excludeVulnerabilityIds = ['39d74cc8-457a-4e57-89ef-a258420138c5'] // list containing ids of vulnerabilities to be ignored
+ // excludeCoordinates = ['commons-fileupload:commons-fileupload:1.3'] // list containing coordinate of components which if vulnerable should be ignored
+}
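(Editor's note: with this block in place the scan is run via the plugin's Gradle task of the same name, `./gradlew ossIndexAudit`. Dependencies matching known Sonatype OSS Index vulnerabilities are reported by coordinate and vulnerability id; the commented-out `excludeVulnerabilityIds`/`excludeCoordinates` options above can be enabled to suppress specific findings.)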
/**
*This specifies what artifacts will be built and uploaded when performing a maven upload.
diff --git a/build_docker_remote.sh b/build_docker_remote.sh
index 72534a02b03..7eada9565c3 100755
--- a/build_docker_remote.sh
+++ b/build_docker_remote.sh
@@ -1,12 +1,26 @@
#!/usr/bin/env bash
#
-# Build (and optionally push) a GATK docker image to GCR using Google Cloud Build. Images are built in the cloud rather than locally. Pushing to dockerhub is not supported by this script.
+# Build (and optionally push) a GATK docker image to GCR using Google Cloud Build. Images are built
+# in the cloud rather than locally. Pushing to dockerhub is not supported by this script.
#
-# If you are pushing an image to our release repositories, be sure that you've followed
-# the setup instructions here:
-# https://github.com/broadinstitute/gatk/wiki/How-to-release-GATK4#setup_docker
-# and here:
-# https://github.com/broadinstitute/gatk/wiki/How-to-release-GATK4#setup_gcloud
+# By default the images are pushed to the following GCR repository:
+#
+# us.gcr.io/broad-dsde-methods/broad-gatk-snapshots/gatk-remote-builds
+#
+# with a name like "${YOUR_USERNAME}-${GITHUB_TAG}-${GIT_HASH_FOR_TAG}"
+#
+# This script should not be used to push to our release repositories. After
+# you've built and staged an image, run the scripts/docker/release_prebuilt_docker_image.sh
+# script to push to the release repositories.
+#
+# Usage: build_docker_remote.sh -e
* To download and extract the data sources, you can invoke {@link FuncotatorDataSourceDownloader} in the following ways:
*
- *
 * {@code ./gatk FuncotatorDataSourceDownloader --somatic --validate-integrity --extract-after-download}
 * {@code ./gatk FuncotatorDataSourceDownloader --germline --validate-integrity --extract-after-download}
 * {@code ./gatk FuncotatorDataSourceDownloader --somatic --validate-integrity --hg38 --extract-after-download}
 * {@code ./gatk FuncotatorDataSourceDownloader --germline --validate-integrity --hg19 --extract-after-download}
* sample1 sample1.vcf.gz
* sample2 sample2.vcf.gz sample2.vcf.gz.tbi
@@ -218,12 +216,14 @@ public final class GenomicsDBImport extends GATKTool {
public static final String MAX_NUM_INTERVALS_TO_IMPORT_IN_PARALLEL = "max-num-intervals-to-import-in-parallel";
public static final String MERGE_CONTIGS_INTO_NUM_PARTITIONS = "merge-contigs-into-num-partitions";
public static final String BYPASS_FEATURE_READER = "bypass-feature-reader";
+ public static final String VCF_HEADER_OVERRIDE = "header";
public static final int INTERVAL_LIST_SIZE_WARNING_THRESHOLD = 100;
public static final int ARRAY_COLUMN_BOUNDS_START = 0;
public static final int ARRAY_COLUMN_BOUNDS_END = 1;
public static final String SHARED_POSIXFS_OPTIMIZATIONS = GenomicsDBArgumentCollection.SHARED_POSIXFS_OPTIMIZATIONS;
public static final String USE_GCS_HDFS_CONNECTOR = GenomicsDBArgumentCollection.USE_GCS_HDFS_CONNECTOR;
+ public static final String AVOID_NIO = "avoid-nio";
@Argument(fullName = WORKSPACE_ARG_LONG_NAME,
doc = "Workspace for GenomicsDB. Can be a POSIX file system absolute or relative path or a HDFS/GCS URL. " +
@@ -239,7 +239,7 @@ public final class GenomicsDBImport extends GATKTool {
"when using the "+INTERVAL_LIST_LONG_NAME+" option. " +
"Either this or "+WORKSPACE_ARG_LONG_NAME+" must be specified. " +
"Must point to an existing workspace.",
- mutex = {WORKSPACE_ARG_LONG_NAME})
+ mutex = {WORKSPACE_ARG_LONG_NAME, VCF_HEADER_OVERRIDE})
private String incrementalImportWorkspace;
@Argument(fullName = SEGMENT_SIZE_ARG_LONG_NAME,
@@ -254,7 +254,7 @@ public final class GenomicsDBImport extends GATKTool {
" data for only a single sample. Either this or " + SAMPLE_NAME_MAP_LONG_NAME +
" must be specified.",
optional = true,
- mutex = {SAMPLE_NAME_MAP_LONG_NAME})
+ mutex = {SAMPLE_NAME_MAP_LONG_NAME, AVOID_NIO})
    private List<String> variantPaths;
@Argument(fullName = VCF_BUFFER_SIZE_ARG_NAME,
@@ -364,6 +364,13 @@ public final class GenomicsDBImport extends GATKTool {
optional = true)
private boolean sharedPosixFSOptimizations = false;
+ @Argument(fullName = VCF_HEADER_OVERRIDE,
+ doc = "Specify a vcf file to use instead of reading and combining headers from the input vcfs",
+ optional = true,
+            mutex = {INCREMENTAL_WORKSPACE_ARG_LONG_NAME}
+ )
+ private FeatureInput headerOverride = null;
+
@Argument(fullName = BYPASS_FEATURE_READER,
doc = "Use htslib to read input VCFs instead of GATK's FeatureReader. This will reduce memory usage and potentially speed up " +
"the import. Lower memory requirements may also enable parallelism through " + MAX_NUM_INTERVALS_TO_IMPORT_IN_PARALLEL +
@@ -371,6 +378,15 @@ public final class GenomicsDBImport extends GATKTool {
optional = true)
private boolean bypassFeatureReader = false;
+ @Argument(fullName = AVOID_NIO,
+ doc = "Do not attempt to open the input vcf file paths in java. This can only be used with " + BYPASS_FEATURE_READER
+ + ". It allows operating on file systems which GenomicsDB understands how to open but GATK does not. This will disable "
+ + "many of the sanity checks.",
+ mutex = {StandardArgumentDefinitions.VARIANT_LONG_NAME}
+ )
+ @Advanced
+ private boolean avoidNio = false;
+
@Argument(fullName = USE_GCS_HDFS_CONNECTOR,
doc = "Use the GCS HDFS Connector instead of the native GCS SDK client with gs:// URLs.",
optional = true)
@@ -440,10 +456,6 @@ public int getDefaultCloudIndexPrefetchBufferSize() {
// Path to combined VCF header file to be written by GenomicsDBImporter
private String vcfHeaderFile;
- // GenomicsDB callset map protobuf structure containing all callset names
- // used to write the callset json file on traversal success
- private GenomicsDBCallsetsMapProto.CallsetMappingPB callsetMappingPB;
-
//in-progress batchCount
private int batchCount = 1;
@@ -463,11 +475,14 @@ public void onStartup() {
initializeWorkspaceAndToolMode();
assertVariantPathsOrSampleNameFileWasSpecified();
assertOverwriteWorkspaceAndIncrementalImportMutuallyExclusive();
+ assertAvoidNioConditionsAreValid();
initializeHeaderAndSampleMappings();
initializeIntervals();
super.onStartup();
}
+
+
private void initializeWorkspaceAndToolMode() {
if (incrementalImportWorkspace != null && !incrementalImportWorkspace.isEmpty()) {
doIncrementalImport = true;
@@ -495,6 +510,24 @@ private void assertVariantPathsOrSampleNameFileWasSpecified(){
}
}
+ private void assertAvoidNioConditionsAreValid() {
+        if (avoidNio && (!bypassFeatureReader || headerOverride == null)) {
+            final List<String> missing = new ArrayList<>();
+ if(!bypassFeatureReader){
+ missing.add(BYPASS_FEATURE_READER);
+ }
+ if(headerOverride == null){
+ missing.add(VCF_HEADER_OVERRIDE);
+ }
+ final String missingArgs = String.join(" and ", missing);
+
+            // this potentially produces an exception with bad grammar but that's probably ok
+            throw new CommandLineException.MissingArgument(missingArgs, "If --" + AVOID_NIO + " is set then --" + BYPASS_FEATURE_READER
+ + " and --" + VCF_HEADER_OVERRIDE + " must also be specified.");
+
+ }
+ }
+
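(Editor's note: taken together, the new arguments form a small contract: `--avoid-nio` is only legal alongside `--bypass-feature-reader` and `--header` (enforced by the check above), and it is mutually exclusive with `-V` (enforced by the mutex on the variant-paths argument), so a workable invocation looks roughly like `--sample-name-map samples.tsv --bypass-feature-reader --header combined_header.vcf --avoid-nio`, with file names hypothetical. In that mode GATK never opens the GVCF paths itself and GenomicsDB is left to resolve them.)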
private static void assertIntervalsCoverEntireContigs(GenomicsDBImporter importer,
List intervals) {
GenomicsDBVidMapProto.VidMappingPB vidMapPB = importer.getProtobufVidMapping();
@@ -523,32 +556,37 @@ private static void assertIntervalsCoverEntireContigs(GenomicsDBImporter importe
*/
private void initializeHeaderAndSampleMappings() {
// Only one of -V and --sampleNameMapFile may be specified
- if (variantPaths != null && variantPaths.size() > 0) {
+ if (variantPaths != null && !variantPaths.isEmpty()) {
// -V was specified
            final List<VCFHeader> headers = new ArrayList<>(variantPaths.size());
sampleNameMap = new SampleNameMap();
- for (final String variantPathString : variantPaths) {
- final Path variantPath = IOUtils.getPath(variantPathString);
- if (bypassFeatureReader) {
- GATKGenomicsDBUtils.assertVariantFileIsCompressedAndIndexed(variantPath);
- }
- final VCFHeader header = getHeaderFromPath(variantPath, null);
- Utils.validate(header != null, "Null header was found in " + variantPath + ".");
- assertGVCFHasOnlyOneSample(variantPathString, header);
- headers.add(header);
- final String sampleName = header.getGenotypeSamples().get(0);
- try {
- sampleNameMap.addSample(sampleName, new URI(variantPathString));
- }
- catch(final URISyntaxException e) {
- throw new UserException("Malformed URI "+e.toString(), e);
+ if(headerOverride == null) {
+ for (final String variantPathString : variantPaths) {
+ final Path variantPath = IOUtils.getPath(variantPathString);
+ if (bypassFeatureReader) { // avoid-nio can't be set here because it requires headerOverride
+ GATKGenomicsDBUtils.assertVariantFileIsCompressedAndIndexed(variantPath);
+ }
+ final VCFHeader header = getHeaderFromPath(variantPath);
+ Utils.validate(header != null, "Null header was found in " + variantPath + ".");
+ assertGVCFHasOnlyOneSample(variantPathString, header);
+ headers.add(header);
+
+ final String sampleName = header.getGenotypeSamples().get(0);
+ try {
+ sampleNameMap.addSample(sampleName, new URI(variantPathString));
+ } catch (final URISyntaxException e) {
+ throw new UserException("Malformed URI " + e.getMessage(), e);
+ }
}
+ mergedHeaderLines = VCFUtils.smartMergeHeaders(headers, true);
+ mergedHeaderSequenceDictionary = new VCFHeader(mergedHeaderLines).getSequenceDictionary();
+ } else {
+ final VCFHeader header = getHeaderFromPath(headerOverride.toPath());
+ mergedHeaderLines = new LinkedHashSet<>(header.getMetaDataInInputOrder());
+ mergedHeaderSequenceDictionary = header.getSequenceDictionary();
}
- mergedHeaderLines = VCFUtils.smartMergeHeaders(headers, true);
- mergedHeaderSequenceDictionary = new VCFHeader(mergedHeaderLines).getSequenceDictionary();
mergedHeaderLines.addAll(getDefaultToolVCFHeaderLines());
-
} else if (sampleNameMapFile != null) {
// --sampleNameMap was specified
@@ -556,31 +594,34 @@ private void initializeHeaderAndSampleMappings() {
//the resulting database will have incorrect sample names
//see https://github.com/broadinstitute/gatk/issues/3682 for more information
// The SampleNameMap class guarantees that the samples will be sorted correctly.
- sampleNameMap = new SampleNameMap(IOUtils.getPath(sampleNameMapFile), bypassFeatureReader);
+ sampleNameMap = new SampleNameMap(IOUtils.getPath(sampleNameMapFile),
+ bypassFeatureReader && !avoidNio);
final String firstSample = sampleNameMap.getSampleNameToVcfPath().entrySet().iterator().next().getKey();
- final Path firstVCFPath = sampleNameMap.getVCFForSampleAsPath(firstSample);
- final Path firstVCFIndexPath = sampleNameMap.getVCFIndexForSampleAsPath(firstSample);
- final VCFHeader header = getHeaderFromPath(firstVCFPath, firstVCFIndexPath);
+ final VCFHeader header;
+ if(headerOverride == null){
+ final Path firstVCFPath = sampleNameMap.getVCFForSampleAsPath(firstSample);
+ header = getHeaderFromPath(firstVCFPath);
+ } else {
+ header = getHeaderFromPath(headerOverride.toPath());
+ }
//getMetaDataInInputOrder() returns an ImmutableSet - LinkedHashSet is mutable and preserves ordering
- mergedHeaderLines = new LinkedHashSet(header.getMetaDataInInputOrder());
+ mergedHeaderLines = new LinkedHashSet<>(header.getMetaDataInInputOrder());
mergedHeaderSequenceDictionary = header.getSequenceDictionary();
mergedHeaderLines.addAll(getDefaultToolVCFHeaderLines());
- }
- else if (getIntervalsFromExistingWorkspace){
+ } else if (getIntervalsFromExistingWorkspace){
final String vcfHeader = IOUtils.appendPathToDir(workspace, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME);
IOUtils.assertPathsAreReadable(vcfHeader);
final String header = GenomicsDBUtils.readEntireFile(vcfHeader);
try {
File tempHeader = IOUtils.createTempFile("tempheader", ".vcf");
- Files.write(tempHeader.toPath(), header.getBytes(StandardCharsets.UTF_8));
+ Files.writeString(tempHeader.toPath(), header);
mergedHeaderSequenceDictionary = VCFFileReader.getSequenceDictionary(tempHeader);
} catch (final IOException e) {
- throw new UserException("Unable to create temporary header file to get sequence dictionary");
+ throw new UserException("Unable to create temporary header file to get sequence dictionary", e);
}
- }
- else {
+ } else {
throw new UserException(StandardArgumentDefinitions.VARIANT_LONG_NAME+" or "+
SAMPLE_NAME_MAP_LONG_NAME+" must be specified unless "+
INTERVAL_LIST_LONG_NAME+" is specified");
@@ -599,8 +640,12 @@ else if (getIntervalsFromExistingWorkspace){
}
}
- private VCFHeader getHeaderFromPath(final Path variantPath, final Path variantIndexPath) {
-        try(final FeatureReader<VariantContext> reader = getReaderFromPath(variantPath, variantIndexPath)) {
+ private VCFHeader getHeaderFromPath(final Path variantPath) {
+        //TODO make this mangling unnecessary
+ final String variantURI = variantPath.toAbsolutePath().toUri().toString();
+        try(final FeatureReader<VariantContext> reader = AbstractFeatureReader.getFeatureReader(variantURI, null, new VCFCodec(), false,
+ BucketUtils.getPrefetchingWrapper(cloudPrefetchBuffer),
+ BucketUtils.getPrefetchingWrapper(cloudIndexPrefetchBuffer))) {
return (VCFHeader) reader.getHeader();
} catch (final IOException e) {
throw new UserException("Error while reading vcf header from " + variantPath.toUri(), e);
@@ -633,9 +678,9 @@ private void writeIntervalListToFile() {
@Override
public void onTraversalStart() {
String workspaceDir = overwriteCreateOrCheckWorkspace();
- vidMapJSONFile = genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME);
- callsetMapJSONFile = genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME);
- vcfHeaderFile = genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME);
+ vidMapJSONFile = GATKGenomicsDBUtils.genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_VIDMAP_FILE_NAME);
+ callsetMapJSONFile = GATKGenomicsDBUtils.genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_CALLSETMAP_FILE_NAME);
+ vcfHeaderFile = GATKGenomicsDBUtils.genomicsDBApppendPaths(workspaceDir, GenomicsDBConstants.DEFAULT_VCFHEADER_FILE_NAME);
if (getIntervalsFromExistingWorkspace) {
// intervals may be null if merge-contigs-into-num-partitions was used to create the workspace
// if so, we need to wait for vid to be generated before writing out the interval list
@@ -775,7 +820,7 @@ private List generateIntervalListFromWorkspace() {
final int start = Integer.parseInt(partitionInfo[1]);
final int end = Integer.parseInt(partitionInfo[2]);
return new SimpleInterval(contig, start, end);
- }).filter(o -> o != null).collect(Collectors.toList());
+ }).filter(Objects::nonNull).collect(Collectors.toList());
}
private ImportConfig createImportConfig(final int batchSize) {
@@ -785,7 +830,7 @@ private ImportConfig createImportConfig(final int batchSize) {
GenomicsDBImportConfiguration.ImportConfiguration.newBuilder();
importConfigurationBuilder.addAllColumnPartitions(partitions);
importConfigurationBuilder.setSizePerColumnPartition(vcfBufferSizePerSample);
- importConfigurationBuilder.setFailIfUpdating(true && !doIncrementalImport);
+ importConfigurationBuilder.setFailIfUpdating(!doIncrementalImport);
importConfigurationBuilder.setSegmentSize(segmentSize);
importConfigurationBuilder.setConsolidateTiledbArrayAfterLoad(doConsolidation);
importConfigurationBuilder.setEnableSharedPosixfsOptimizations(sharedPosixFSOptimizations);
@@ -936,7 +981,7 @@ private FeatureReader getReaderFromPath(final Path variantPath,
/* Anonymous FeatureReader subclass that wraps returned iterators to ensure that the GVCFs do not
* contain MNPs.
*/
-        return new FeatureReader<VariantContext>() {
+ return new FeatureReader<>() {
/** Iterator that asserts that variants are not MNPs. */
            class NoMnpIterator implements CloseableTribbleIterator<VariantContext> {
                private final CloseableTribbleIterator<VariantContext> inner;
@@ -971,7 +1016,8 @@ public VariantContext next() {
return new NoMnpIterator(reader.query(chr, start, end));
}
-            @Override public CloseableTribbleIterator<VariantContext> iterator() throws IOException {
+            @Override
+            public CloseableTribbleIterator<VariantContext> iterator() throws IOException {
return new NoMnpIterator(reader.iterator());
}
};
@@ -990,7 +1036,7 @@ public VariantContext next() {
* @return The workspace directory
*/
private String overwriteCreateOrCheckWorkspace() {
- String workspaceDir = genomicsDBGetAbsolutePath(workspace);
+ String workspaceDir = GATKGenomicsDBUtils.genomicsDBGetAbsolutePath(workspace);
// From JavaDoc for GATKGenomicsDBUtils.createTileDBWorkspacevid
// returnCode = 0 : OK. If overwriteExistingWorkspace is true and the workspace exists, it is deleted first.
// returnCode = -1 : path was not a directory
@@ -1016,7 +1062,7 @@ private String overwriteCreateOrCheckWorkspace() {
}
static class UnableToCreateGenomicsDBWorkspace extends UserException {
- private static final long serialVersionUID = 1L;
+ @Serial private static final long serialVersionUID = 1L;
UnableToCreateGenomicsDBWorkspace(final String message){
super(message);
@@ -1028,7 +1074,7 @@ static class UnableToCreateGenomicsDBWorkspace extends UserException {
* dictionary (as returned by {@link #getBestAvailableSequenceDictionary})
* to parse/verify them. Does nothing if no intervals were specified.
*/
- protected void initializeIntervals() {
+ void initializeIntervals() {
if (intervalArgumentCollection.intervalsSpecified()) {
if (getIntervalsFromExistingWorkspace || doIncrementalImport) {
logger.warn(INCREMENTAL_WORKSPACE_ARG_LONG_NAME+" was set, so ignoring specified intervals." +
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/CommonCode.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/CommonCode.java
index 234fee68b3b..83f1048f535 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/CommonCode.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/CommonCode.java
@@ -3,7 +3,7 @@
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
import org.broadinstitute.hellbender.utils.genotyper.IndexedAlleleList;
import java.util.ArrayList;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java
index d5c1eafc248..304e385441c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java
@@ -5,7 +5,7 @@
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.avro.generic.GenericRecord;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.engine.FeatureContext;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortFilterRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortFilterRecord.java
index a81a57d8be3..8ab3b6b9d6f 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortFilterRecord.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortFilterRecord.java
@@ -2,7 +2,7 @@
import htsjdk.samtools.util.Locatable;
import org.apache.avro.generic.GenericRecord;
-import org.apache.commons.lang.builder.ReflectionToStringBuilder;
+import org.apache.commons.lang3.builder.ReflectionToStringBuilder;
import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;
public class ExtractCohortFilterRecord implements Locatable {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortRecord.java
index ffa1e44e77e..623c0678902 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortRecord.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortRecord.java
@@ -2,7 +2,7 @@
import htsjdk.samtools.util.Locatable;
import org.apache.avro.generic.GenericRecord;
-import org.apache.commons.lang.builder.ReflectionToStringBuilder;
+import org.apache.commons.lang3.builder.ReflectionToStringBuilder;
import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;
import java.util.Objects;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ReferenceRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ReferenceRecord.java
index 8f929262b7a..e5cda0ef47e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ReferenceRecord.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ReferenceRecord.java
@@ -3,7 +3,7 @@
import htsjdk.samtools.util.Locatable;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.util.Utf8;
-import org.apache.commons.lang.builder.ReflectionToStringBuilder;
+import org.apache.commons.lang3.builder.ReflectionToStringBuilder;
import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;
public class ReferenceRecord implements Locatable, Comparable<ReferenceRecord> {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java
index 1919215d953..66644d3fc4b 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java
@@ -9,6 +9,7 @@
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
+import org.apache.parquet.schema.MessageTypeParser;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.DeprecatedFeature;
@@ -24,6 +25,14 @@
import org.broadinstitute.hellbender.utils.GenomeLocSortedSet;
import org.broadinstitute.hellbender.utils.IntervalUtils;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
+import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
+import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;
+import static org.apache.parquet.schema.Type.Repetition.REQUIRED;
+
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.PrimitiveType;
+
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
@@ -175,6 +184,42 @@ public final class CreateVariantIngestFiles extends VariantWalker {
private final Set gqStatesToIgnore = new HashSet<>();
+ public final MessageType variantRowSchema = MessageTypeParser
+ .parseMessageType("""
+ message VariantRow {
+ required int64 sample_id;
+ required int64 location;
+ required binary ref (UTF8);
+ required binary alt (UTF8);
+ optional binary AS_RAW_MQ (UTF8);
+ optional binary AS_RAW_MQRankSum (UTF8);
+ optional binary AS_QUALapprox (UTF8);
+ optional binary AS_RAW_ReadPosRankSum (UTF8);
+ optional binary AS_SB_TABLE (UTF8);
+ optional binary AS_VarDP (UTF8);
+ required binary call_GT (UTF8);
+ optional binary call_AD (UTF8);
+ optional binary call_DP (UTF8);
+ required int64 call_GQ;
+ optional binary call_PGT (UTF8);
+ optional binary call_PID (UTF8);
+ optional binary call_PS (UTF8);
+ optional binary call_PL (UTF8);
+ }
+ """);
+
+ /* Not yet outputting ref_ranges rows */
+ public final MessageType refRangesRowSchema = MessageTypeParser
+ .parseMessageType("""
+ message VariantRow {
+ required int64 sample_id;
+ required int64 location;
+ required int64 length;
+ required binary state (UTF8);
+ }
+ """);
+
+
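(Editor's note: the commented-out test code further down exercises these schemas through a `GvsVariantParquetFileWriter` class that is not shown in this diff. As a rough illustration of how a `MessageType` like `variantRowSchema` is consumed, here is a minimal sketch using Parquet's bundled example object model (`ExampleParquetWriter`/`SimpleGroupFactory`, from the Parquet artifacts added in build.gradle above) rather than the GVS writer; it uses a trimmed schema containing only the required columns, and the output path and field values are placeholders.)

```java
// Minimal sketch (not GATK code): write two rows against a trimmed VariantRow schema
// using Parquet's example object model. Path and values are hypothetical.
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class VariantRowParquetSketch {
    public static void main(String[] args) throws Exception {
        // Only the required columns of variantRowSchema, so every appended field is mandatory.
        final MessageType schema = MessageTypeParser.parseMessageType(
                "message VariantRow {"
                + " required int64 sample_id;"
                + " required int64 location;"
                + " required binary ref (UTF8);"
                + " required binary alt (UTF8);"
                + " required binary call_GT (UTF8);"
                + " required int64 call_GQ;"
                + "}");
        final SimpleGroupFactory rows = new SimpleGroupFactory(schema);
        try (ParquetWriter<Group> writer = ExampleParquetWriter
                .builder(new Path("/tmp/variant_rows.parquet")) // placeholder location
                .withType(schema)
                .withCompressionCodec(CompressionCodecName.SNAPPY)
                .build()) {
            // Two hypothetical rows; values mirror the shape of the test data further below.
            writer.write(rows.newGroup()
                    .append("sample_id", 7L).append("location", 17000026885022L)
                    .append("ref", "G").append("alt", "C")
                    .append("call_GT", "1/1").append("call_GQ", 3L));
            writer.write(rows.newGroup()
                    .append("sample_id", 7L).append("location", 15000101914931L)
                    .append("ref", "C").append("alt", "T")
                    .append("call_GT", "1/1").append("call_GQ", 3L));
        }
    }
}
```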
// getGenotypes() returns list of lists for all samples at variant
// assuming one sample per gvcf, getGenotype(0) retrieves GT for sample at index 0
public static boolean isNoCall(VariantContext variant) {
@@ -228,6 +273,148 @@ public void onTraversalStart() {
boolean vetRowsExist = false;
boolean vcfHeaderRowsExist = false;
+// List<ColumnDescriptor> cols = variantRowSchema.getColumns();
+// for (int i = 0; i < cols.size(); ++i) {
+// ColumnDescriptor col = cols.get(i);
+// logger.info("col.getPath()");
+// logger.info(col.getPath());
+// logger.info("col.getPrimitiveType()");
+// logger.info(col.getPrimitiveType());
+// logger.info("col.getPrimitiveType().getName()");
+// logger.info(col.getPrimitiveType().getName());
+//
+// }
+
+
+ /* TEST CODE BELOW WORKS
+ java.nio.file.Path currentRelativePath = Paths.get("");
+ String s = currentRelativePath.toAbsolutePath().toString();
+ System.out.println("Current absolute path is: " + s);
+ File testParquetOutputFile = new File(s+ "/testFile.parquet");
+ System.out.println("Test file will be written to: " + testParquetOutputFile.getAbsolutePath());
+ try {
+ GvsVariantParquetFileWriter testWriter = new GvsVariantParquetFileWriter(new Path(testParquetOutputFile.toURI()), variantRowSchema, false, CompressionCodecName.SNAPPY);
+ // feed the file some test data
+ JSONArray testData = new JSONArray("""
+ [{"AS_QUALapprox":"25","AS_RAW_MQ":"0|0","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"25","alt":"C","call_AD":null,"call_GQ":"3","call_GT":"1/1","call_PGT":"0|1","call_PID":"26885022_G_C","call_PL":"25,3,0,25,3,25","location":"17000026885022","ref":"G","sample_id":"7"},
+ {"AS_QUALapprox":"45","AS_RAW_MQ":"0|0","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"45","alt":"T","call_AD":null,"call_GQ":"3","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"45,3,0,45,3,45","location":"15000101914931","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"0","AS_RAW_MQ":"0|43200","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"1","alt":"C","call_AD":"0,0","call_GQ":"0","call_GT":"0/1","call_PGT":"0|1","call_PID":"73839326_CAAA_C","call_PL":"1,0,0,1,0,1","location":"16000073839326","ref":"CAAA","sample_id":"7"},
+ {"AS_QUALapprox":"0","AS_RAW_MQ":"0|6894","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"1","alt":"A","call_AD":"0,0","call_GQ":"0","call_GT":"0/1","call_PGT":"0|1","call_PID":"66798833_ATGG_A","call_PL":"1,0,0,1,0,1","location":"17000066798833","ref":"ATGG","sample_id":"7"},
+ {"AS_QUALapprox":"0","AS_RAW_MQ":"0|18000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"1","alt":"C","call_AD":"0,0","call_GQ":"0","call_GT":"0/1","call_PGT":null,"call_PID":null,"call_PL":"1,0,0,1,0,1","location":"16000080965202","ref":"CATATAT","sample_id":"7"},
+ {"AS_QUALapprox":"0","AS_RAW_MQ":"0|43200","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"1","alt":"C","call_AD":"0,0","call_GQ":"0","call_GT":"0/1","call_PGT":null,"call_PID":null,"call_PL":"1,0,0,1,0,1","location":"16000003489112","ref":"CTTTT","sample_id":"7"},
+ {"AS_QUALapprox":"0","AS_RAW_MQ":"0|45450","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"1","alt":"G","call_AD":"0,0","call_GQ":"0","call_GT":"0/1","call_PGT":null,"call_PID":null,"call_PL":"1,0,0,1,0,1","location":"16000046773931","ref":"GC","sample_id":"7"},
+ {"AS_QUALapprox":"0","AS_RAW_MQ":"0|28800","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"1","alt":"C","call_AD":"0,0","call_GQ":"0","call_GT":"0/1","call_PGT":null,"call_PID":null,"call_PL":"1,0,0,1,0,1","location":"15000070796035","ref":"CAAAAA","sample_id":"7"},
+ {"AS_QUALapprox":"0","AS_RAW_MQ":"0|36110","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"1","alt":"A","call_AD":"0,0","call_GQ":"0","call_GT":"0/1","call_PGT":"0|1","call_PID":"72146971_AG_A","call_PL":"1,0,0,1,1,1","location":"17000072146974","ref":"AGGAGG","sample_id":"7"},
+ {"AS_QUALapprox":"0","AS_RAW_MQ":"0|36110","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"1","alt":"A","call_AD":"0,0","call_GQ":"0","call_GT":"0/1","call_PGT":"0|1","call_PID":"72146971_AG_A","call_PL":"1,0,0,1,1,1","location":"17000072146971","ref":"AG","sample_id":"7"},
+ {"AS_QUALapprox":"0","AS_RAW_MQ":"0|54000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"1","alt":"CGTCTCTCTG","call_AD":"0,0","call_GQ":"0","call_GT":"0/1","call_PGT":"0|1","call_PID":"87178619_CG_C","call_PL":"1,0,0,2,2,4","location":"16000087178644","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"0","AS_RAW_MQ":"0|55820","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"1","alt":"T","call_AD":"0,0","call_GQ":"0","call_GT":"0/1","call_PGT":"0|1","call_PID":"13467380_TGGG_T","call_PL":"1,0,0,3,2,5","location":"17000013467408","ref":"A","sample_id":"7"},
+ {"AS_QUALapprox":"1","AS_RAW_MQ":"0|7200","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"1","alt":"T","call_AD":"0,0","call_GQ":"1","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"1,1,0,1,1,1","location":"16000059226361","ref":"TATATATAATATAC","sample_id":"7"},
+ {"AS_QUALapprox":"2","AS_RAW_MQ":"0|142210","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,3","AS_VarDP":"0|0","QUALapprox":"2","alt":"CAA","call_AD":"0,0","call_GQ":"0","call_GT":"0/1","call_PGT":null,"call_PID":null,"call_PL":"2,0,0,1,1,1","location":"16000024911166","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"2","AS_RAW_MQ":"0|28289","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"2","alt":"T","call_AD":"0,0","call_GQ":"1","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"2,1,0,1,1,1","location":"15000100466812","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"2","AS_RAW_MQ":"0|44336","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"2","alt":"T","call_AD":"0,0","call_GQ":"1","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"2,1,0,2,1,1","location":"16000062126446","ref":"TG","sample_id":"7"},
+ {"AS_QUALapprox":"2","AS_RAW_MQ":"0|43825","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"2","alt":"C","call_AD":"0,0","call_GQ":"1","call_GT":"1/1","call_PGT":"0|1","call_PID":"17066723_CTTTTTTT_C","call_PL":"2,1,0,2,1,2","location":"17000017066723","ref":"CTTTTTTT","sample_id":"7"},
+ {"AS_QUALapprox":"2","AS_RAW_MQ":"0|47529","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"2","alt":"G","call_AD":"0,0","call_GQ":"1","call_GT":"1/1","call_PGT":"0|1","call_PID":"74109516_AAT_A","call_PL":"2,1,0,2,2,3","location":"16000074109606","ref":"T","sample_id":"7"},
+ {"AS_QUALapprox":"2","AS_RAW_MQ":"0|54000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"2","alt":"C","call_AD":"0,0","call_GQ":"1","call_GT":"1/1","call_PGT":"0|1","call_PID":"87178619_CG_C","call_PL":"2,1,0,3,2,4","location":"16000087178640","ref":"CT","sample_id":"7"},
+ {"AS_QUALapprox":"3","AS_RAW_MQ":"0|4988","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"3","alt":"A","call_AD":"0,0","call_GQ":"1","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"3,1,0,3,1,3","location":"18000012870365","ref":"G","sample_id":"7"},
+ {"AS_QUALapprox":"3","AS_RAW_MQ":"0|25929","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"3","alt":"AAC","call_AD":"0,0","call_GQ":"1","call_GT":"1/1","call_PGT":"0|1","call_PID":"74109516_AAT_A","call_PL":"3,1,0,3,1,3","location":"16000074109638","ref":"A","sample_id":"7"},
+ {"AS_QUALapprox":"3","AS_RAW_MQ":"0|29529","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"3","alt":"A","call_AD":"0,0","call_GQ":"1","call_GT":"1/1","call_PGT":"0|1","call_PID":"74109516_AAT_A","call_PL":"3,1,0,3,1,3","location":"16000074109633","ref":"G","sample_id":"7"},
+ {"AS_QUALapprox":"3","AS_RAW_MQ":"0|43929","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"3","alt":"TTA","call_AD":"0,0","call_GQ":"1","call_GT":"1/1","call_PGT":"0|1","call_PID":"74109516_AAT_A","call_PL":"3,1,0,3,2,4","location":"16000074109622","ref":"T","sample_id":"7"},
+ {"AS_QUALapprox":"3","AS_RAW_MQ":"0|56334","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"3","alt":"A","call_AD":"0,0","call_GQ":"1","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"3,1,0,4,3,5","location":"17000013467441","ref":"T","sample_id":"7"},
+ {"AS_QUALapprox":"3","AS_RAW_MQ":"0|25929","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"3","alt":"AAT","call_AD":"0,0","call_GQ":"2","call_GT":"1/1","call_PGT":"0|1","call_PID":"74109516_AAT_A","call_PL":"3,2,0,3,2,3","location":"16000074109643","ref":"A","sample_id":"7"},
+ {"AS_QUALapprox":"3","AS_RAW_MQ":"0|57600","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"3","alt":"GCC","call_AD":"0,0","call_GQ":"2","call_GT":"1/1","call_PGT":"0|1","call_PID":"87178619_CG_C","call_PL":"3,2,0,4,2,4","location":"16000087178628","ref":"G","sample_id":"7"},
+ {"AS_QUALapprox":"4","AS_RAW_MQ":"0|72000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"4","alt":"CTTTCTCCCTCCTT","call_AD":"0,0","call_GQ":"2","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"4,2,0,5,2,5","location":"16000087178618","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"4","AS_RAW_MQ":"0|61200","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"4","alt":"C","call_AD":"0,0","call_GQ":"2","call_GT":"1/1","call_PGT":"0|1","call_PID":"87178619_CG_C","call_PL":"4,2,0,5,3,5","location":"16000087178619","ref":"CG","sample_id":"7"},
+ {"AS_QUALapprox":"5","AS_RAW_MQ":"0|55521","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"5","alt":"T","call_AD":"0,0","call_GQ":"2","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"5,2,0,5,2,5","location":"15000079059222","ref":"TCCTTC","sample_id":"7"},
+ {"AS_QUALapprox":"6","AS_RAW_MQ":"0|55820","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"6","alt":"TTTCCC","call_AD":"0,0","call_GQ":"2","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"6,3,0,5,2,4","location":"17000013467376","ref":"T","sample_id":"7"},
+ {"AS_QUALapprox":"6","AS_RAW_MQ":"0|54000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"6","alt":"A","call_AD":"0,0","call_GQ":"3","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"6,3,0,6,3,6","location":"17000009630990","ref":"AAAAGAAAG","sample_id":"7"},
+ {"AS_QUALapprox":"6","AS_RAW_MQ":"0|56947","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"6","alt":"T","call_AD":"0,0","call_GQ":"3","call_GT":"1/1","call_PGT":"0|1","call_PID":"71253570_TATATAC_T","call_PL":"6,3,0,6,4,7","location":"16000071253584","ref":"TATATAC","sample_id":"7"},
+ {"AS_QUALapprox":"6","AS_RAW_MQ":"0|53347","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"6","alt":"T","call_AD":"0,0","call_GQ":"3","call_GT":"1/1","call_PGT":"0|1","call_PID":"71253570_TATATAC_T","call_PL":"6,3,0,7,4,7","location":"16000071253570","ref":"TATATAC","sample_id":"7"},
+ {"AS_QUALapprox":"9","AS_RAW_MQ":"0|25200","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"9","alt":"TA","call_AD":"0,0","call_GQ":"2","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"9,2,0,9,3,9","location":"17000067472332","ref":"T","sample_id":"7"},
+ {"AS_QUALapprox":"16","AS_RAW_MQ":"0|33129","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"16","alt":"CTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"16,3,0,3,0,0","location":"17000030026961","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"17","AS_RAW_MQ":"0|32400","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"17","alt":"CTTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"17,3,0,3,0,0","location":"16000074081756","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"17","AS_RAW_MQ":"0|28800","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"17","alt":"CT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"17,3,0,3,0,0","location":"16000077464450","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"17","AS_RAW_MQ":"0|18000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"17","alt":"CAAAAAA","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"17,3,0,3,0,0","location":"16000003276234","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"17","AS_RAW_MQ":"0|39600","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"17","alt":"CTTTTTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"17,5,0,5,0,0","location":"16000030379640","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"18","AS_RAW_MQ":"0|35764","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"18","alt":"CA","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"18,3,0,3,0,0","location":"17000027579878","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"20","AS_RAW_MQ":"0|32400","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"20","alt":"C","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"20,3,0,3,0,0","location":"15000061980183","ref":"CAA","sample_id":"7"},
+ {"AS_QUALapprox":"23","AS_RAW_MQ":"0|46800","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"23","alt":"CTTTTTTTTTTTTTTTTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"23,1,0,1,0,0","location":"16000024669941","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"26","AS_RAW_MQ":"0|103504","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,2","AS_VarDP":"0|0","QUALapprox":"26","alt":"CAA","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"26,4,0,4,0,0","location":"17000039382092","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"27","AS_RAW_MQ":"0|43200","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"27","alt":"CTTTTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"27,2,0,2,0,0","location":"16000017514797","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"31","AS_RAW_MQ":"0|36000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"31","alt":"T","call_AD":"0,0","call_GQ":"1","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"31,2,0,3,1,1","location":"16000087178667","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"34","AS_RAW_MQ":"0|28800","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"34","alt":"CA","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"34,3,0,3,0,0","location":"17000028822802","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"36","AS_RAW_MQ":"0|32400","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"36","alt":"CTTTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"36,3,0,3,0,0","location":"15000061236834","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"40","AS_RAW_MQ":"0|28800","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,2","AS_VarDP":"0|0","QUALapprox":"40","alt":"GA","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"40,6,0,6,0,0","location":"15000082302715","ref":"G","sample_id":"7"},
+ {"AS_QUALapprox":"42","AS_RAW_MQ":"0|32400","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"42","alt":"CTTTTTTTTTTTTTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"42,2,0,2,0,0","location":"16000030913975","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"44","AS_RAW_MQ":"0|18000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"44","alt":"CAAAAAAAAAAAAAA","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"44,2,0,2,0,0","location":"17000013560289","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"44","AS_RAW_MQ":"0|18000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"44","alt":"CAAAAAAAAAAA","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"44,3,0,3,0,0","location":"17000041879647","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"44","AS_RAW_MQ":"0|43200","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"44","alt":"CTTTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"44,3,0,3,0,0","location":"16000087731046","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"44","AS_RAW_MQ":"0|28800","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"44","alt":"CTTTTTTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"44,3,0,3,0,0","location":"16000057292424","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"45","AS_RAW_MQ":"0|4129","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"45","alt":"AATAT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"45,3,0,3,0,0","location":"17000011812026","ref":"A","sample_id":"7"},
+ {"AS_QUALapprox":"51","AS_RAW_MQ":"0|39600","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"51","alt":"GTTTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"51,5,0,7,0,1","location":"16000075799147","ref":"G","sample_id":"7"},
+ {"AS_QUALapprox":"52","AS_RAW_MQ":"0|10800","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"52","alt":"GTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"52,8,0,8,0,0","location":"17000076388639","ref":"G","sample_id":"7"},
+ {"AS_QUALapprox":"54","AS_RAW_MQ":"0|21600","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"54","alt":"CAAAAAAAAAAAA","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"54,5,0,5,0,0","location":"16000074958592","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"62","AS_RAW_MQ":"0|32400","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"62","alt":"CAAAAAAAAA","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"62,5,0,5,0,0","location":"17000027903859","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"63","AS_RAW_MQ":"0|10800","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"63","alt":"ATATATATTTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"63,6,0,6,0,0","location":"17000035720573","ref":"A","sample_id":"7"},
+ {"AS_QUALapprox":"71","AS_RAW_MQ":"0|54000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"71","alt":"CTTTTTTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"71,5,0,5,0,0","location":"15000066658402","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"72","AS_RAW_MQ":"0|31609","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"72","alt":"CTTTTTTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"72,5,0,5,0,0","location":"16000073507485","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"81","AS_RAW_MQ":"0|25929","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"81","alt":"CAAAAAAAAAAAAAAAAAAA","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"81,4,0,4,0,0","location":"17000007233087","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"81","AS_RAW_MQ":"0|32400","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"81","alt":"CTTTTTTTTTTTTTTTTTTTTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"81,4,0,4,0,0","location":"16000072688811","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"85","AS_RAW_MQ":"0|36000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"85","alt":"GTTTTTTTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"85,7,0,7,0,0","location":"17000030138909","ref":"G","sample_id":"7"},
+ {"AS_QUALapprox":"89","AS_RAW_MQ":"0|25200","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"89","alt":"CAA","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"89,5,0,6,0,0","location":"16000074086867","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"128","AS_RAW_MQ":"0|14400","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"128","alt":"CTTTTTTTTTTTTTTTTTTTTTTTTTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"128,7,0,8,0,0","location":"16000077805865","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"133","AS_RAW_MQ":"0|18000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"133","alt":"CTTTTTTTTTTTTTTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"133,8,0,8,0,0","location":"16000005651040","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"134","AS_RAW_MQ":"0|14400","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"134","alt":"CTTTTTTTTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"134,9,0,9,0,0","location":"16000082637428","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"18","AS_RAW_MQ":"0|21600","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"18","alt":"GTTTTTTTTT","call_AD":"0,0","call_GQ":"2","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"18,2,0,11,2,8","location":"17000030878718","ref":"G","sample_id":"7"},
+ {"AS_QUALapprox":"10","AS_RAW_MQ":"0|68400","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"10","alt":"T","call_AD":"0,0","call_GQ":"5","call_GT":"1/1","call_PGT":"0|1","call_PID":"10782225_A_T","call_PL":"10,5,0,10,5,10","location":"18000010782336","ref":"A","sample_id":"7"},
+ {"AS_QUALapprox":"11","AS_RAW_MQ":"0|40221","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"11","alt":"GTA","call_AD":"0,0","call_GQ":"5","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"11,5,0,11,5,11","location":"17000013467357","ref":"G","sample_id":"7"},
+ {"AS_QUALapprox":"11","AS_RAW_MQ":"0|61200","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"11","alt":"A","call_AD":"0,0","call_GQ":"5","call_GT":"1/1","call_PGT":"0|1","call_PID":"5308090_ATATATATACCTATACATATATG_A","call_PL":"11,5,0,11,5,11","location":"18000005308090","ref":"ATATATATACCTATACATATATG","sample_id":"7"},
+ {"AS_QUALapprox":"13","AS_RAW_MQ":"0|60570","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"13","alt":"A","call_AD":"0,0","call_GQ":"6","call_GT":"1/1","call_PGT":"0|1","call_PID":"92047557_TAA_T","call_PL":"13,6,0,13,6,13","location":"15000092047583","ref":"AT","sample_id":"7"},
+ {"AS_QUALapprox":"13","AS_RAW_MQ":"0|60570","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"13","alt":"T","call_AD":"0,0","call_GQ":"6","call_GT":"1/1","call_PGT":"0|1","call_PID":"92047557_TAA_T","call_PL":"13,6,0,13,6,13","location":"15000092047588","ref":"TATATA","sample_id":"7"},
+ {"AS_QUALapprox":"16","AS_RAW_MQ":"0|54000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"16","alt":"CTTTTTTTTTTTTTTT","call_AD":"0,0","call_GQ":"1","call_GT":"1/1","call_PGT":"0|1","call_PID":"67984630_C_CTTTTTTTTTTTTTTT","call_PL":"16,1,0,17,3,19","location":"17000067984630","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"17","AS_RAW_MQ":"0|43929","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"17","alt":"C","call_AD":"0,0","call_GQ":"7","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"17,7,0,17,7,17","location":"16000013990612","ref":"CAT","sample_id":"7"},
+ {"AS_QUALapprox":"170","AS_RAW_MQ":"0|55170","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"170","alt":"CCCTTCCCTAT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"170,9,0,10,0,0","location":"15000079059257","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"18","AS_RAW_MQ":"0|18000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"18","alt":"CAA","call_AD":"0,0","call_GQ":"3","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"18,3,0,18,3,18","location":"16000000713318","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"27","AS_RAW_MQ":"0|18000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"27","alt":"ATTTTTTTTTTTTTTT","call_AD":"0,0","call_GQ":"2","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"27,2,0,28,3,29","location":"16000062098154","ref":"A","sample_id":"7"},
+ {"AS_QUALapprox":"27","AS_RAW_MQ":"0|22337","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"27","alt":"CAAA","call_AD":"0,0","call_GQ":"3","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"27,3,0,27,3,27","location":"16000032126382","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"28","AS_RAW_MQ":"0|28800","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"28","alt":"CAA","call_AD":"0,0","call_GQ":"3","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"28,3,0,21,3,18","location":"17000006238053","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"36","AS_RAW_MQ":"0|54000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"36","alt":"CAAAAAA","call_AD":"0,0","call_GQ":"2","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"36,2,0,37,3,38","location":"17000005494594","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"39","AS_RAW_MQ":"0|40129","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"39","alt":"CAAAAAAAA","call_AD":"0,0","call_GQ":"1","call_GT":"0/1","call_PGT":null,"call_PID":null,"call_PL":"39,0,3,16,1,14","location":"17000020360227","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"41","AS_RAW_MQ":"0|21600","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"41","alt":"CAAAAAAAAAAAAAAA","call_AD":"0,0","call_GQ":"1","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"41,1,0,17,3,16","location":"16000071736098","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"43","AS_RAW_MQ":"0|18000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"43","alt":"GTTTTTTTTTTTTTTTT","call_AD":"0,0","call_GQ":"2","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"43,2,0,44,3,45","location":"17000019526269","ref":"G","sample_id":"7"},
+ {"AS_QUALapprox":"45","AS_RAW_MQ":"0|3600","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"45","alt":"A","call_AD":"0,0","call_GQ":"3","call_GT":"1/1","call_PGT":"0|1","call_PID":"89603914_GA_G","call_PL":"45,3,0,45,3,45","location":"16000089603975","ref":"G","sample_id":"7"},
+ {"AS_QUALapprox":"77","AS_RAW_MQ":"0|18000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"77","alt":"CAAAAAAAAAAAAAAAAAAAAAA","call_AD":"0,0","call_GQ":"8","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"77,8,0,78,9,79","location":"17000012602773","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"8","AS_RAW_MQ":"0|63171","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"8","alt":"CGCTGAGCAGTGTGTGGGGTGGAGCGGTTGCTGACCGGGGCGCTGAGCAGTGTGTGGGGTGGAGCGGGTGTTGACTGGGGT","call_AD":"0,0","call_GQ":"3","call_GT":"1/1","call_PGT":"0|1","call_PID":"89898371_C_CGCTGAGCAGTGTGTGGGGTGGAGCGGTTGCTGACCGGGGCGCTGAGCAGTGTGTGGGGTGGAGCGGGTGTTGACTGGGGT","call_PL":"8,3,0,14,10,22","location":"16000089898371","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"81","AS_RAW_MQ":"0|45555","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"81","alt":"AATATAAAATAATTACATATTATATATTTATATATAAT","call_AD":"0,0","call_GQ":"4","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"81,4,0,84,7,87","location":"16000065128304","ref":"A","sample_id":"7"},
+ {"AS_QUALapprox":"88","AS_RAW_MQ":"0|16401","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"88","alt":"GAAAAAAAAAAAAAA","call_AD":"0,0","call_GQ":"5","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"88,5,0,89,6,90","location":"17000043305380","ref":"G","sample_id":"7"},
+ {"AS_QUALapprox":"88","AS_RAW_MQ":"0|36784","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"88","alt":"GTTTTTTTTTTTTT","call_AD":"0,0","call_GQ":"5","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"88,5,0,89,6,90","location":"16000083080871","ref":"G","sample_id":"7"},
+ {"AS_QUALapprox":"98","AS_RAW_MQ":"0|56809","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"98","alt":"CAAAAAAAA","call_AD":"0,0","call_GQ":"2","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"98,8,0,32,2,24","location":"15000077053451","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"161","AS_RAW_MQ":"0|46800","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"161","alt":"CTTTTTTTTTTTTTTTTTT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"161,11,0,11,0,0","location":"17000056368276","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"179","AS_RAW_MQ":"0|18000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"179","alt":"TC","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"179,15,0,15,0,0","location":"16000023902800","ref":"T","sample_id":"7"},
+ {"AS_QUALapprox":"199","AS_RAW_MQ":"0|43200","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"199","alt":"CAAAAAAAAAAAAAAAAA","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"199,13,0,14,0,1","location":"17000028353520","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"223","AS_RAW_MQ":"0|39600","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"223","alt":"CATATATAT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"223,14,0,14,0,0","location":"17000074365044","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"24","AS_RAW_MQ":"0|18000","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"24","alt":"CAAAAAAAAAAAAA","call_AD":"0,0","call_GQ":"1","call_GT":"0/1","call_PGT":null,"call_PID":null,"call_PL":"24,0,40,24,1,25","location":"18000005328386","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"250","AS_RAW_MQ":"0|34301","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"250","alt":"CCAAA","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"250,27,0,27,0,0","location":"16000012606864","ref":"C","sample_id":"7"},
+ {"AS_QUALapprox":"266","AS_RAW_MQ":"0|43200","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"266","alt":"G","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"266,29,0,30,0,1","location":"15000061980212","ref":"A","sample_id":"7"},
+ {"AS_QUALapprox":"276","AS_RAW_MQ":"0|33625","AS_RAW_MQRankSum":null,"AS_RAW_ReadPosRankSum":null,"AS_SB_TABLE":"0,0|0,0","AS_VarDP":"0|0","QUALapprox":"276","alt":"GT","call_AD":"0,0","call_GQ":"0","call_GT":"1/1","call_PGT":null,"call_PID":null,"call_PL":"276,24,0,25,0,1","location":"15000062888285","ref":"G","sample_id":"7"}
+ ]
+ """);
+ for (int i = 0; i < testData.length(); i++)
+ {
+ JSONObject jsonObj = testData.getJSONObject(i);
+ testWriter.write(jsonObj);
+ }
+ testWriter.close();
+ } catch (IOException exception) {
+ exception.printStackTrace();
+ System.exit(1);
+ }
+
+
+
+ logger.info("Exiting early, trying to set up Parquet writing");
+ System.exit(0);
+ */
+
// If BQ, check the load status table to see if this sample has already been loaded.
if (outputType == CommonCode.OutputType.BQ) {
loadStatus = new LoadStatus(projectID, datasetName, loadStatusTableName);
@@ -295,7 +482,7 @@ public void onTraversalStart() {
}
if (enableVet && !vetRowsExist) {
- vetCreator = new VetCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, outputDir, outputType, projectID, datasetName, forceLoadingFromNonAlleleSpecific, skipLoadingVqsrFields);
+ vetCreator = new VetCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, outputDir, outputType, projectID, datasetName, forceLoadingFromNonAlleleSpecific, skipLoadingVqsrFields, variantRowSchema);
}
if (enableVCFHeaders && !vcfHeaderRowsExist) {
buildAllVcfLineHeaders();
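
The commented-out scaffold earlier in this hunk parses a JSON text block into a JSONArray and pushes each JSONObject through the experimental Parquet writer before exiting. A minimal sketch of that loop with try-with-resources, assuming only that the writer exposes write(JSONObject) and close() as the diff suggests (RowWriter, JsonRowsToParquetSketch, and writeAll are illustrative names, not part of this change):

    import java.io.Closeable;
    import java.io.IOException;

    import org.json.JSONArray;
    import org.json.JSONObject;

    final class JsonRowsToParquetSketch {
        // Stand-in for the write(JSONObject)/close() surface GvsVariantParquetFileWriter appears to expose.
        interface RowWriter extends Closeable {
            void write(JSONObject row) throws IOException;
        }

        static void writeAll(final String jsonArrayText, final RowWriter writer) throws IOException {
            final JSONArray rows = new JSONArray(jsonArrayText);
            try (writer) {                                  // writer is closed even if a write fails
                for (int i = 0; i < rows.length(); i++) {
                    writer.write(rows.getJSONObject(i));    // one output row per JSON object
                }
            }
        }
    }
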
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetCreator.java
index aec23295f7d..7adf61a09ce 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetCreator.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetCreator.java
@@ -3,12 +3,16 @@
import com.google.protobuf.Descriptors;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
+import org.apache.parquet.schema.MessageType;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.gvs.common.CommonCode;
import org.broadinstitute.hellbender.tools.gvs.common.IngestConstants;
import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;
import org.broadinstitute.hellbender.utils.gvs.bigquery.BigQueryUtils;
import org.broadinstitute.hellbender.utils.gvs.bigquery.PendingBQWriter;
+import org.broadinstitute.hellbender.utils.gvs.parquet.GvsVariantParquetFileWriter;
import org.broadinstitute.hellbender.utils.tsv.SimpleXSVWriter;
import org.json.JSONObject;
@@ -26,6 +30,7 @@ public class VetCreator {
private SimpleXSVWriter vetWriter = null;
private final Long sampleId;
private PendingBQWriter vetBQJsonWriter = null;
+ private GvsVariantParquetFileWriter vetParquetFileWriter = null;
private final boolean forceLoadingFromNonAlleleSpecific;
private final boolean skipLoadingVqsrFields;
@@ -36,7 +41,7 @@ public static boolean doRowsExistFor(CommonCode.OutputType outputType, String pr
return BigQueryUtils.doRowsExistFor(projectId, datasetName, VET_FILETYPE_PREFIX + tableNumber, SchemaUtils.SAMPLE_ID_FIELD_NAME, sampleId);
}
- public VetCreator(String sampleIdentifierForOutputFileName, Long sampleId, String tableNumber, final File outputDirectory, final CommonCode.OutputType outputType, final String projectId, final String datasetName, final boolean forceLoadingFromNonAlleleSpecific, final boolean skipLoadingVqsrFields) {
+ public VetCreator(String sampleIdentifierForOutputFileName, Long sampleId, String tableNumber, final File outputDirectory, final CommonCode.OutputType outputType, final String projectId, final String datasetName, final boolean forceLoadingFromNonAlleleSpecific, final boolean skipLoadingVqsrFields, final MessageType parquetSchema) {
this.sampleId = sampleId;
this.outputType = outputType;
this.forceLoadingFromNonAlleleSpecific = forceLoadingFromNonAlleleSpecific;
@@ -57,6 +62,11 @@ public VetCreator(String sampleIdentifierForOutputFileName, Long sampleId, Strin
final File vetOutputFile = new File(outputDirectory, VET_FILETYPE_PREFIX + tableNumber + PREFIX_SEPARATOR + sampleIdentifierForOutputFileName + IngestConstants.FILETYPE);
vetWriter = new SimpleXSVWriter(vetOutputFile.toPath(), IngestConstants.SEPARATOR);
vetWriter.setHeaderLine(getHeaders());
+ break;
+ case PARQUET:
+ final File parquetOutputFile = new File(outputDirectory, VET_FILETYPE_PREFIX + tableNumber + PREFIX_SEPARATOR + sampleIdentifierForOutputFileName + ".parquet");
+ vetParquetFileWriter = new GvsVariantParquetFileWriter(new Path(parquetOutputFile.toURI()), parquetSchema, CompressionCodecName.SNAPPY);
+ break;
}
} catch (final IOException ioex) {
throw new UserException("Could not create vet outputs", ioex);
@@ -84,6 +94,9 @@ public void apply(VariantContext variant) throws IOException {
);
vetWriter.getNewLineBuilder().setRow(row).write();
break;
+ case PARQUET:
+ vetParquetFileWriter.write(createJson(location, variant, sampleId));
+ break;
}
}
@@ -128,7 +141,16 @@ public void commitData() {
if (outputType == CommonCode.OutputType.BQ && vetBQJsonWriter != null) {
vetBQJsonWriter.flushBuffer();
vetBQJsonWriter.commitWriteStreams();
+ } else if (outputType == CommonCode.OutputType.PARQUET && vetParquetFileWriter != null) {
+ try {
+ vetParquetFileWriter.close();
+ } catch (IOException exception) {
+ System.out.println("ERROR CLOSING PARQUET FILE: ");
+ exception.printStackTrace();
+ }
}
+
+
}
public void closeTool() {
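
The new PARQUET branch above takes a MessageType (variantRowSchema) describing the row layout; this change builds that schema outside the lines shown here. As a hedged illustration only, a schema covering a few of the JSON keys visible in the test data could be assembled with parquet-mr's Types builder; the field names, types, and optionality below are assumptions, not the actual GVS schema:

    import org.apache.parquet.schema.LogicalTypeAnnotation;
    import org.apache.parquet.schema.MessageType;
    import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
    import org.apache.parquet.schema.Types;

    final class VetRowSchemaSketch {
        static MessageType vetRowSchema() {
            return Types.buildMessage()
                    .required(PrimitiveTypeName.INT64).named("location")
                    .required(PrimitiveTypeName.INT64).named("sample_id")
                    .required(PrimitiveTypeName.BINARY).as(LogicalTypeAnnotation.stringType()).named("ref")
                    .required(PrimitiveTypeName.BINARY).as(LogicalTypeAnnotation.stringType()).named("alt")
                    .optional(PrimitiveTypeName.BINARY).as(LogicalTypeAnnotation.stringType()).named("call_GT")
                    .optional(PrimitiveTypeName.INT64).named("call_GQ")
                    .named("vet");   // message name is illustrative
        }
    }
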
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetFieldEnum.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetFieldEnum.java
index a82ad989d26..6df437f01e2 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetFieldEnum.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetFieldEnum.java
@@ -4,7 +4,7 @@
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.UserException;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java
index de6a914e42c..f1f71a6eb7f 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/CreateHadoopBamSplittingIndex.java
@@ -1,5 +1,6 @@
package org.broadinstitute.hellbender.tools.spark;
+import com.google.common.io.Files;
import htsjdk.samtools.*;
import htsjdk.samtools.BAMSBIIndexer;
import htsjdk.samtools.seekablestream.SeekableFileStream;
@@ -18,7 +19,6 @@
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.ReadConstants;
-import org.codehaus.plexus.util.FileUtils;
import picard.cmdline.programgroups.OtherProgramGroup;
import java.io.*;
@@ -166,7 +166,7 @@ private static void assertBamIsCoordinateSorted(final SAMFileHeader header) {
private static void assertIsBam(final File inputBam) {
if(!BamFileIoUtils.isBamFile(inputBam)) {
throw new UserException.BadInput("A splitting index is only relevant for a bam file, but a "
- + "file with extension "+ FileUtils.getExtension(inputBam.getName()) + " was specified.");
+ + "file with extension "+ Files.getFileExtension(inputBam.getName()) + " was specified.");
}
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
index 95bb083f03b..de91be5e5b6 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
@@ -89,7 +89,10 @@ public enum ComplexVariantSubtype {
piDUP_RF,
dDUP,
dDUP_iDEL,
- INS_iDEL
+ INS_iDEL,
+ CTX_PP_QQ,
+ CTX_PQ_QP,
+ CTX_INV
}
// not defined in output vcf header but used in internal id that is currently output in the ID column
@@ -163,6 +166,7 @@ public enum ComplexVariantSubtype {
public static final String NONCODING_BREAKPOINT = "PREDICTED_NONCODING_BREAKPOINT";
public static final String NEAREST_TSS = "PREDICTED_NEAREST_TSS";
public static final String TSS_DUP = "PREDICTED_TSS_DUP";
+ public static final String PARTIAL_DISPERSED_DUP = "PREDICTED_PARTIAL_DISPERSED_DUP";
// SVTYPE classes
public enum StructuralVariantAnnotationType {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFastqUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFastqUtils.java
index 6c016f84436..2f51a7df114 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFastqUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVFastqUtils.java
@@ -4,7 +4,7 @@
import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
-import com.netflix.servo.util.VisibleForTesting;
+import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.*;
import htsjdk.samtools.util.Locatable;
import htsjdk.samtools.util.SequenceUtil;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
index 8edcd595881..0f95878b389 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
@@ -132,8 +132,12 @@ private void validateCoordinates(final SAMSequenceDictionary dictionary) {
Utils.nonNull(dictionary);
validatePosition(contigA, positionA, dictionary);
validatePosition(contigB, positionB, dictionary);
- Utils.validateArg(IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) <= 0,
- "End coordinate cannot precede start");
+ // CPX types may have position B precede A, such as dispersed duplications where A is the insertion point and
+ // B references the source sequence.
+ if (type != GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
+ Utils.validateArg(IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) <= 0,
+ "End coordinate cannot precede start");
+ }
}
private static void validatePosition(final String contig, final int position, final SAMSequenceDictionary dictionary) {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java
index da4e66f3fb6..9ca5056a679 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/concordance/ClosestSVFinder.java
@@ -31,6 +31,7 @@
*/
public class ClosestSVFinder {
+ protected final boolean sortOutput;
protected final Map truthIdToItemMap;
protected final Map idToClusterMap;
private final SVConcordanceLinkage linkage;
@@ -49,7 +50,9 @@ public class ClosestSVFinder {
*/
public ClosestSVFinder(final SVConcordanceLinkage linkage,
final Function collapser,
+ final boolean sortOutput,
final SAMSequenceDictionary dictionary) {
+ this.sortOutput = sortOutput;
this.linkage = Utils.nonNull(linkage);
this.collapser = Utils.nonNull(collapser);
outputBuffer = new PriorityQueue<>(SVCallRecordUtils.getCallComparator(dictionary));
@@ -60,6 +63,15 @@ public ClosestSVFinder(final SVConcordanceLinkage linkage,
lastItemContig = null;
}
+ /**
+ * Sorts output by default
+ */
+ public ClosestSVFinder(final SVConcordanceLinkage linkage,
+ final Function collapser,
+ final SAMSequenceDictionary dictionary) {
+ this(linkage, collapser, true, dictionary);
+ }
+
/**
* Should be run frequently to reduce memory usage. Forced flushes must be run when a new contig is encountered.
* @param force flushes all variants in the output buffer regardless of position
@@ -70,8 +82,12 @@ public List flush(final boolean force) {
.map(c -> new ClosestPair(c.getItem(), c.getClosest()))
.map(collapser)
.collect(Collectors.toList());
- outputBuffer.addAll(collapsedRecords);
- return flushBuffer(force);
+ if (sortOutput) {
+ outputBuffer.addAll(collapsedRecords);
+ return flushBuffer(force);
+ } else {
+ return collapsedRecords;
+ }
}
/**
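
The new sortOutput flag decides whether collapsed records pass straight through or are routed into the comparator-ordered output buffer before being flushed. A small generic sketch of that buffer-then-drain pattern follows; names are illustrative, and the real flushBuffer also drains by position when force is not set rather than emitting nothing:

    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.List;
    import java.util.PriorityQueue;

    final class SortedFlushSketch<T> {
        private final PriorityQueue<T> outputBuffer;

        SortedFlushSketch(final Comparator<T> comparator) {
            this.outputBuffer = new PriorityQueue<>(comparator);
        }

        List<T> flush(final List<T> collapsedRecords, final boolean sortOutput, final boolean force) {
            if (!sortOutput) {
                return collapsedRecords;               // unsorted mode: emit immediately
            }
            outputBuffer.addAll(collapsedRecords);
            final List<T> out = new ArrayList<>();
            if (force) {
                while (!outputBuffer.isEmpty()) {
                    out.add(outputBuffer.poll());      // drains in comparator order
                }
            }
            return out;
        }
    }
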
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReadAnonymizer.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReadAnonymizer.java
index 6c36592b6ae..16b5142fb84 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReadAnonymizer.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReadAnonymizer.java
@@ -4,7 +4,7 @@
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import org.aeonbits.owner.util.Collections;
-import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.lang3.ArrayUtils;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.ExperimentalFeature;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java
index b0ff4985cec..1d9949b219e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java
@@ -3,7 +3,7 @@
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.*;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.read.GATKRead;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java
index 931a6bb5256..7045f4678ec 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java
@@ -4,7 +4,7 @@
import htsjdk.samtools.util.Locatable;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
-import org.apache.commons.lang.mutable.MutableInt;
+import org.apache.commons.lang3.mutable.MutableInt;
import org.broadinstitute.barclay.argparser.Argument;
import org.apache.commons.lang3.tuple.Triple;
import org.broadinstitute.barclay.help.DocumentedFeature;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OrientationBiasReadCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OrientationBiasReadCounts.java
index addbd29dfd7..96a749de50c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OrientationBiasReadCounts.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OrientationBiasReadCounts.java
@@ -4,7 +4,7 @@
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
-import org.apache.commons.lang.mutable.MutableInt;
+import org.apache.commons.lang3.mutable.MutableInt;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ReferenceBases.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ReferenceBases.java
index 9c2d99b97b7..3ef5ade7148 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ReferenceBases.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ReferenceBases.java
@@ -4,7 +4,7 @@
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.utils.Utils;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java
index 9d78efd6d1b..f8bb3b3cea9 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java
@@ -7,7 +7,7 @@
import htsjdk.variant.vcf.VCFCompoundHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.walkers.annotator.AnnotationUtils;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java
index 5b4af7ec0ff..ef01d6fbf62 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_RankSumTest.java
@@ -5,7 +5,7 @@
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
-import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.lang3.ArrayUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.engine.ReferenceContext;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java
index 0c4a07f015a..acf1c74bf9c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java
@@ -4,6 +4,7 @@
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.walkers.annotator.InfoFieldAnnotation;
@@ -149,7 +150,7 @@ private String establishReadGroupFlowOrder(final LocalContext localContext, fina
// if here, no flow order was found. may we use a default?
if ( isActualFlowOrderRequired() ) {
localContext.generateAnnotation = false;
- flowMissingOneShotLogger.warn("this.getClass().getSimpleName() + \" annotation will not be calculated, no '\" + StandardArgumentDefinitions.FLOW_ORDER_FOR_ANNOTATIONS + \"' argument provided\"");
+ flowMissingOneShotLogger.warn(this.getClass().getSimpleName() + " annotation will not be calculated, no '" + StandardArgumentDefinitions.FLOW_ORDER_FOR_ANNOTATIONS + "' argument provided");
}
return FlowBasedRead.DEFAULT_FLOW_ORDER;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java
index 44b07f6fcfe..93420133913 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationModel.java
@@ -1,7 +1,7 @@
package org.broadinstitute.hellbender.tools.walkers.contamination;
import htsjdk.samtools.util.OverlapDetector;
-import org.apache.commons.lang.mutable.MutableDouble;
+import org.apache.commons.lang3.mutable.MutableDouble;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.math3.optim.univariate.UnivariatePointValuePair;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationSegmenter.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationSegmenter.java
index 1a023488270..607f6691c58 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationSegmenter.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/ContaminationSegmenter.java
@@ -13,7 +13,7 @@
import java.util.stream.IntStream;
public class ContaminationSegmenter {
- public static final Range<Double> ALT_FRACTIONS_FOR_SEGMENTATION = Range.between(0.1, 0.9);
+ public static final Range<Double> ALT_FRACTIONS_FOR_SEGMENTATION = Range.of(0.1, 0.9);
public static final double KERNEL_SEGMENTER_LINEAR_COST = 1.0;
public static final double KERNEL_SEGMENTER_LOG_LINEAR_COST = 1.0;
public static final int KERNEL_SEGMENTER_DIMENSION = 100;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java
index b3ddc3e21b6..2014ce3030e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java
@@ -2,12 +2,14 @@
import htsjdk.samtools.*;
import htsjdk.samtools.util.Locatable;
+import htsjdk.samtools.util.SequenceUtil;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.*;
+import org.apache.commons.lang3.StringUtils;
import org.apache.commons.math3.util.Precision;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@@ -81,6 +83,19 @@
@ExperimentalFeature
public final class FlowFeatureMapper extends ReadWalker {
+ static class CopyAttrInfo {
+ public final String name;
+ public final VCFHeaderLineType type;
+ public final String desc;
+
+ public CopyAttrInfo(final String spec) {
+ final String[] toks = spec.split(",");
+ name = toks[0];
+ type = toks.length > 1 ? VCFHeaderLineType.valueOf(toks[1]) : VCFHeaderLineType.String;
+ desc = toks.length > 2 ? StringUtils.join(Arrays.copyOfRange(toks, 2, toks.length), ",") : ("copy-attr: " + name);
+ }
+ }
+
private static final Logger logger = LogManager.getLogger(FlowFeatureMapper.class);
private static final String VCB_SOURCE = "fm";
@@ -101,8 +116,18 @@ public final class FlowFeatureMapper extends ReadWalker {
private static final String VCF_SMQ_RIGHT = "X_SMQ_RIGHT";
private static final String VCF_SMQ_LEFT_MEAN = "X_SMQ_LEFT_MEAN";
private static final String VCF_SMQ_RIGHT_MEAN = "X_SMQ_RIGHT_MEAN";
+ private static final String VCF_ADJACENT_REF_DIFF = "X_ADJACENT_REF_DIFF";
+
private static final Double LOWEST_PROB = 0.0001;
+ private static final int VENDOR_QUALITY_CHECK_FLAG = 0x200;
+
+ private static final String INCLUDE_QC_FAILED_READS_FULL_NAME = "include-qc-failed-reads";
+
+ final private List<CopyAttrInfo> copyAttrInfo = new LinkedList<>();
+
+ // order here is according to SequenceUtil.VALID_BASES_UPPER
+ final private String scoreForBaseNames[] = new String[SequenceUtil.VALID_BASES_UPPER.length];
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
@@ -131,6 +156,10 @@ public final class FlowFeatureMapper extends ReadWalker {
@Argument(fullName=HaplotypeCallerArgumentCollection.OUTPUT_BLOCK_LOWER_BOUNDS, doc = "Output the band lower bound for each GQ block regardless of the data it represents", optional = true)
public boolean floorBlocks = false;
+ @Advanced
+ @Argument(fullName=INCLUDE_QC_FAILED_READS_FULL_NAME, doc = "include reads with QC failed flag", optional = true)
+ public boolean includeQcFailedReads = false;
+
@ArgumentCollection
public FlowBasedArgumentCollection fbargs = new FlowBasedArgumentCollection();
@@ -175,6 +204,9 @@ protected static class MappedFeature implements Comparable {
int smqLeftMean;
int smqRightMean;
+ double scoreForBase[];
+ boolean adjacentRefDiff;
+
public MappedFeature(GATKRead read, FlowFeatureMapperArgumentCollection.MappingFeatureEnum type, byte[] readBases,
byte[] refBases, int readBasesOffset, int start, int offsetDelta) {
this.read = read;
@@ -315,8 +347,20 @@ public VCFHeader makeVCFHeader(final SAMSequenceDictionary sequenceDictionary, f
headerInfo.add(new VCFInfoHeaderLine(VCF_SMQ_RIGHT, 1, VCFHeaderLineType.Integer, "Ordinal Median quality of N bases to the right of the feature"));
headerInfo.add(new VCFInfoHeaderLine(VCF_SMQ_LEFT_MEAN, 1, VCFHeaderLineType.Integer, "Ordinal Mean quality of N bases to the left of the feature"));
headerInfo.add(new VCFInfoHeaderLine(VCF_SMQ_RIGHT_MEAN, 1, VCFHeaderLineType.Integer, "Ordinal Mean quality of N bases to the right of the feature"));
- for ( String name : fmArgs.copyAttr ) {
- headerInfo.add(new VCFInfoHeaderLine(fmArgs.copyAttrPrefix + name, 1, VCFHeaderLineType.String, "copy-attr: " + name));
+ for ( String spec : fmArgs.copyAttr ) {
+ final CopyAttrInfo info = new CopyAttrInfo(spec);
+ headerInfo.add(new VCFInfoHeaderLine(fmArgs.copyAttrPrefix + info.name, 1, info.type, info.desc));
+ copyAttrInfo.add(info);
+ }
+
+ // validation mode?
+ if ( fmArgs.reportAllAlts ) {
+ for ( int baseIndex = 0 ; baseIndex < SequenceUtil.VALID_BASES_UPPER.length ; baseIndex++ ) {
+ headerInfo.add(new VCFInfoHeaderLine(scoreNameForBase(baseIndex), 1, VCFHeaderLineType.Float, "Base specific mapping score"));
+ }
+ }
+ if ( fmArgs.reportAllAlts || fmArgs.tagBasesWithAdjacentRefDiff ) {
+ headerInfo.add(new VCFInfoHeaderLine(VCF_ADJACENT_REF_DIFF, 1, VCFHeaderLineType.Flag, "Adjacent base filter indication: indel in the adjacent 5 bases to the considered base on the read"));
}
final VCFHeader vcfHeader = new VCFHeader(headerInfo);
@@ -324,6 +368,13 @@ public VCFHeader makeVCFHeader(final SAMSequenceDictionary sequenceDictionary, f
return vcfHeader;
}
+ private String scoreNameForBase(int baseIndex) {
+ if ( scoreForBaseNames[baseIndex] == null ) {
+ scoreForBaseNames[baseIndex] = VCF_SCORE + "_" + new String(new byte[]{SequenceUtil.VALID_BASES_UPPER[baseIndex]});
+ }
+ return scoreForBaseNames[baseIndex];
+ }
+
@Override
public void apply(final GATKRead read, final ReferenceContext referenceContext, final FeatureContext featureContext) {
@@ -337,6 +388,11 @@ public void apply(final GATKRead read, final ReferenceContext referenceContext,
return;
}
+ // include qc-failed reads?
+ if ( ((read.getFlags() & VENDOR_QUALITY_CHECK_FLAG) != 0) && !includeQcFailedReads ) {
+ return;
+ }
+
// flush queues up to this read
flushQueue(read, referenceContext);
@@ -349,6 +405,20 @@ public void apply(final GATKRead read, final ReferenceContext referenceContext,
// score the feature
fr.score = scoreFeature(fr);
+ // score for validation mode?
+ if ( fmArgs.reportAllAlts) {
+ fr.scoreForBase = new double[SequenceUtil.VALID_BASES_UPPER.length];
+ for ( int baseIndex = 0 ; baseIndex < fr.scoreForBase.length ; baseIndex++ ) {
+ final byte base = SequenceUtil.VALID_BASES_UPPER[baseIndex];
+ boolean incl = base != fr.readBases[0];
+ if ( incl ) {
+ fr.scoreForBase[baseIndex] = scoreFeature(fr, base);
+ } else {
+ fr.scoreForBase[baseIndex] = Double.NaN;
+ }
+ }
+ }
+
// emit feature if filters in
if ( filterFeature(fr) ) {
featureQueue.add(fr);
@@ -413,14 +483,17 @@ private void enrichFeature(final MappedFeature fr) {
}
private double scoreFeature(final MappedFeature fr) {
+ return scoreFeature(fr, (byte)0);
+ }
+ private double scoreFeature(final MappedFeature fr, byte altBase) {
// build haplotypes
final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), fr.read);
- final FlowBasedHaplotype[] haplotypes = buildHaplotypes(fr, rgInfo.flowOrder);
+ final FlowBasedHaplotype[] haplotypes = buildHaplotypes(fr, rgInfo.flowOrder, altBase);
// create flow read
- final FlowBasedRead flowRead = new FlowBasedRead(fr.read, rgInfo.flowOrder,
- rgInfo.maxClass, fbargs);
+ final FlowBasedRead flowRead = new FlowBasedRead(fr.read, rgInfo.flowOrder, rgInfo.maxClass, fbargs);
+
final int diffLeft = haplotypes[0].getStart() - flowRead.getStart() + fr.offsetDelta;
final int diffRight = flowRead.getEnd() - haplotypes[0].getEnd();
flowRead.applyBaseClipping(Math.max(0, diffLeft), Math.max(diffRight, 0), false);
@@ -524,16 +597,24 @@ public static double computeLikelihoodLocal(final FlowBasedRead read, final Flow
return result;
}
- private FlowBasedHaplotype[] buildHaplotypes(final MappedFeature fr, final String flowOrder) {
+ private FlowBasedHaplotype[] buildHaplotypes(final MappedFeature fr, final String flowOrder, byte altBase) {
// build bases for flow haplotypes
// NOTE!!!: this code assumes length of feature on read and reference is the same
// this is true for SNP but not for INDELs - it will have to be re-written!
// TODO: write for INDEL
- final byte[] bases = fr.read.getBasesNoCopy();
+ byte[] bases = fr.read.getBasesNoCopy();
int offset = fr.readBasesOffset;
int refStart = fr.start;
int refModOfs = 0;
+
+ // install alt base?
+ byte orgBase = 0;
+ if ( altBase != 0 ) {
+ orgBase = fr.refBases[0];
+ fr.refBases[0] = altBase;
+ }
+
if ( offset > 0 ) {
// reach into hmer before
offset--;
@@ -569,6 +650,11 @@ private FlowBasedHaplotype[] buildHaplotypes(final MappedFeature fr, final Strin
new FlowBasedHaplotype(refHaplotype, flowOrder)
};
+ // restore changes
+ if ( altBase != 0 ) {
+ fr.refBases[0] = orgBase;
+ }
+
// return
return result;
}
@@ -590,7 +676,11 @@ private void emitFeature(final MappedFeature fr) {
// create alleles
final Collection<Allele> alleles = new LinkedList<>();
- alleles.add(Allele.create(fr.readBases, false));
+ if ( fmArgs.reportAllAlts && Arrays.equals(fr.readBases, fr.refBases) ) {
+ alleles.add(Allele.create("*".getBytes(), false));
+ } else {
+ alleles.add(Allele.create(fr.readBases, false));
+ }
alleles.add(Allele.create(fr.refBases, true));
// create variant context builder
@@ -625,11 +715,34 @@ private void emitFeature(final MappedFeature fr) {
vcb.attribute(VCF_SMQ_RIGHT_MEAN, fr.smqRightMean);
}
- for ( String name : fmArgs.copyAttr ) {
- if ( fr.read.hasAttribute(name) ) {
- vcb.attribute(fmArgs.copyAttrPrefix + name, fr.read.getAttributeAsString(name));
+ for ( CopyAttrInfo info : copyAttrInfo ) {
+ if ( fr.read.hasAttribute(info.name) ) {
+ final String attrName = fmArgs.copyAttrPrefix + info.name;
+ if ( info.type == VCFHeaderLineType.Integer ) {
+ vcb.attribute(attrName, fr.read.getAttributeAsInteger(info.name));
+ } else if ( info.type == VCFHeaderLineType.Float ) {
+ vcb.attribute(attrName, fr.read.getAttributeAsFloat(info.name));
+ } else {
+ vcb.attribute(attrName, fr.read.getAttributeAsString(info.name));
+ }
+ }
+ }
+
+ // validation mode?
+ if ( fmArgs.reportAllAlts ) {
+ if ( fr.scoreForBase != null ) {
+ for (int baseIndex = 0; baseIndex < SequenceUtil.VALID_BASES_UPPER.length; baseIndex++) {
+ if (!Double.isNaN(fr.scoreForBase[baseIndex])) {
+ vcb.attribute(scoreNameForBase(baseIndex), String.format("%.5f", fr.scoreForBase[baseIndex]));
+ }
+ }
}
}
+ if ( fr.adjacentRefDiff ) {
+ vcb.attribute(VCF_ADJACENT_REF_DIFF, true);
+ }
+
+ // build it!
final VariantContext vc = vcb.make();
// write to file
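
In validation mode (report-all-alts) the mapper computes one score per candidate base and leaves NaN for the base the read actually carries, which is why the emit code above only writes the non-NaN entries. A stripped-down sketch of that loop, where scoreFor stands in for scoreFeature(fr, base):

    final class PerAltScoreSketch {
        // mirrors the SequenceUtil.VALID_BASES_UPPER ordering assumed above
        private static final byte[] VALID_BASES_UPPER = {'A', 'C', 'G', 'T'};

        static double[] scoresForAllAlts(final byte readBase) {
            final double[] scoreForBase = new double[VALID_BASES_UPPER.length];
            for (int baseIndex = 0; baseIndex < scoreForBase.length; baseIndex++) {
                final byte base = VALID_BASES_UPPER[baseIndex];
                scoreForBase[baseIndex] = (base == readBase)
                        ? Double.NaN                   // the read's own base gets no alt score
                        : scoreFor(base);
            }
            return scoreForBase;
        }

        private static double scoreFor(final byte altBase) {
            return 0.0;                                // placeholder for the flow-based likelihood
        }
    }
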
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java
index 22c13b00878..03660746d2b 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapperArgumentCollection.java
@@ -32,7 +32,7 @@ enum MappingFeatureEnum {
/**
* attributes to copy from bam
**/
- @Argument(fullName = "copy-attr", doc = "attributes to copy from bam", optional = true)
+ @Argument(fullName = "copy-attr", doc = "attributes to copy from bam. format ,,. types: Integer, Float, String, Character, Flag", optional = true)
public List copyAttr = new LinkedList<>();
/**
@@ -116,4 +116,18 @@ enum MappingFeatureEnum {
@Hidden
@Argument(fullName = "surrounding-mean-quality-size", doc = "number of bases around the feature to calculate surrounding mean quality", optional = true)
public Integer surroundingMeanQualitySize = null;
+
+ /**
+ * validation mode - if not specified, this feature is off
+ **/
+ @Hidden
+ @Argument(fullName = "report-all-alts", doc = "In this mode (aka validation mode), every base of every read in the input CRAM and interval is reported, and an X_SCORE value is calculated for all 3 possible alts", optional = true)
+ public boolean reportAllAlts = false;
+
+ /**
+ * adjacent-ref-diff mode - if not specified, this feature is off
+ **/
+ @Hidden
+ @Argument(fullName = "tag-bases-with-adjacent-ref-diff", doc = "In this mode bases that have an adjacent difference from the reference on the same read are not discarded, and tagged with X_ADJACENT_REF_DIFFm", optional = true)
+ public boolean tagBasesWithAdjacentRefDiff = false;
}
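
The copy-attr format documented above is the comma-separated spec that CopyAttrInfo in FlowFeatureMapper splits into a name, an optional VCF type, and an optional description (commas are allowed in the description). A standalone sketch of that parse, using an illustrative spec value rather than anything from this change:

    import java.util.Arrays;

    import org.apache.commons.lang3.StringUtils;

    final class CopyAttrSpecSketch {
        public static void main(final String[] args) {
            final String spec = "tp,Integer,tp attribute copied from the BAM";   // illustrative value
            final String[] toks = spec.split(",");
            final String name = toks[0];
            final String type = toks.length > 1 ? toks[1] : "String";            // defaults to String
            final String desc = toks.length > 2
                    ? StringUtils.join(Arrays.copyOfRange(toks, 2, toks.length), ",")
                    : "copy-attr: " + name;
            System.out.printf("name=%s type=%s desc=%s%n", name, type, desc);
        }
    }
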
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java
index 05c952c317a..3f23cb5a6e6 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java
@@ -1,7 +1,7 @@
package org.broadinstitute.hellbender.tools.walkers.featuremapping;
import htsjdk.samtools.CigarElement;
-import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.GATKException;
@@ -21,22 +21,34 @@
public class SNVMapper implements FeatureMapper {
- final int identBefore;
- final int identAfter;
+ final int surroundBefore;
+ final int surroundAfter;
final int minCigarElementLength;
final LevenshteinDistance levDistance = new LevenshteinDistance();
final Integer smqSize;
final Integer smqSizeMean;
+ final boolean ignoreSurround;
+ final int spanBefore;
+ final int spanAfter;
+
+ final FlowFeatureMapperArgumentCollection fmArgs;
+
public SNVMapper(FlowFeatureMapperArgumentCollection fmArgs) {
- identBefore = fmArgs.snvIdenticalBases;
- identAfter = (fmArgs.snvIdenticalBasesAfter != 0) ? fmArgs.snvIdenticalBasesAfter : identBefore;
- minCigarElementLength = identBefore + 1 + identAfter;
+ surroundBefore = fmArgs.snvIdenticalBases;
+ surroundAfter = (fmArgs.snvIdenticalBasesAfter != 0) ? fmArgs.snvIdenticalBasesAfter : surroundBefore;
smqSize = fmArgs.surroundingMediaQualitySize;
smqSizeMean = fmArgs.surroundingMeanQualitySize;
+ this.fmArgs = fmArgs;
+
+ // ignore surround
+ ignoreSurround = fmArgs.reportAllAlts || fmArgs.tagBasesWithAdjacentRefDiff;
+ spanBefore = ignoreSurround ? 0 : surroundBefore;
+ spanAfter = ignoreSurround ? 0 : surroundAfter;
+ minCigarElementLength = spanBefore + 1 + spanAfter;
// adjust minimal read length
- FlowBasedRead.setMinimalReadLength(1 + 1 + identAfter);
+ FlowBasedRead.setMinimalReadLength(1 + 1 + spanAfter);
}
@Override
@@ -93,30 +105,44 @@ public void forEachOnRead(GATKRead read, ReferenceContext referenceContext, Cons
if ( length >= minCigarElementLength &&
cigarElement.getOperator().consumesReadBases() &&
cigarElement.getOperator().consumesReferenceBases() ) {
- readOfs += identBefore;
- refOfs += identBefore;
- for ( int ofs = identBefore ; ofs < length - identAfter ; ofs++, readOfs++, refOfs++ ) {
+ readOfs += spanBefore;
+ refOfs += spanBefore;
+ for ( int ofs = spanBefore ; ofs < length - spanAfter ; ofs++, readOfs++, refOfs++ ) {
- if ( ref[refOfs] != 'N' && bases[readOfs] != ref[refOfs] ) {
+ if ( ref[refOfs] != 'N' && (fmArgs.reportAllAlts || (bases[readOfs] != ref[refOfs])) ) {
// check that this is really a SNV (must be surrounded by identical ref)
boolean surrounded = true;
- for ( int i = 0 ; i < identBefore && surrounded ; i++ ) {
- if ( bases[readOfs-1-i] != ref[refOfs-1-i] ) {
+ for ( int i = 0 ; i < surroundBefore && surrounded ; i++ ) {
+ final int bIndex = readOfs-1-i;
+ final int rIndex = refOfs-1-i;
+ if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) {
+ surrounded = false;
+ continue;
+ }
+ if ( bases[bIndex] != ref[rIndex] ) {
surrounded = false;
}
}
- for ( int i = 0 ; i < identAfter && surrounded ; i++ ) {
- if ( bases[readOfs+1+i] != ref[refOfs+1+i] ) {
+ for (int i = 0; i < surroundAfter && surrounded ; i++ ) {
+ final int bIndex = readOfs+1+i;
+ final int rIndex = refOfs+1+i;
+ if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) {
+ surrounded = false;
+ continue;
+ }
+ if ( bases[bIndex] != ref[rIndex] ) {
surrounded = false;
}
}
- if ( !surrounded ) {
+ if ( (!fmArgs.reportAllAlts && !fmArgs.tagBasesWithAdjacentRefDiff) && !surrounded ) {
continue;
}
// add this feature
FlowFeatureMapper.MappedFeature feature = FlowFeatureMapper.MappedFeature.makeSNV(read, readOfs, ref[refOfs], referenceContext.getStart() + refOfs, readOfs - refOfs);
+ if ( (fmArgs.reportAllAlts || fmArgs.tagBasesWithAdjacentRefDiff) && !surrounded )
+ feature.adjacentRefDiff = true;
feature.nonIdentMBasesOnRead = nonIdentMBases;
feature.refEditDistance = refEditDistance;
if ( !read.isReverseStrand() )
@@ -166,8 +192,8 @@ public void forEachOnRead(GATKRead read, ReferenceContext referenceContext, Cons
features.add(feature);
}
}
- readOfs += identAfter;
- refOfs += identAfter;
+ readOfs += spanAfter;
+ refOfs += spanAfter;
} else {
@@ -243,8 +269,8 @@ public FilterStatus noFeatureButFilterAt(GATKRead read, ReferenceContext referen
cigarElement.getOperator().consumesReferenceBases() ) {
// break out if not enough clearing
- if ( (start < referenceContext.getStart() + refOfs + identBefore) ||
- (start >= referenceContext.getStart() + refOfs + length - identAfter) )
+ if ( (start < referenceContext.getStart() + refOfs + spanBefore) ||
+ (start >= referenceContext.getStart() + refOfs + length - spanAfter) )
return FilterStatus.Filtered;
int delta = start - (referenceContext.getStart() + refOfs);
@@ -255,13 +281,25 @@ public FilterStatus noFeatureButFilterAt(GATKRead read, ReferenceContext referen
// check that this is really a SNV (must be surrounded by identical ref)
boolean surrounded = true;
- for ( int i = 0 ; i < identBefore && surrounded ; i++ ) {
- if ( bases[readOfs-1-i] != ref[refOfs-1-i] ) {
+ for ( int i = 0 ; i < surroundBefore && surrounded ; i++ ) {
+ final int bIndex = readOfs-1-i;
+ final int rIndex = refOfs-1-i;
+ if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) {
+ surrounded = false;
+ continue;
+ }
+ if ( bases[bIndex] != ref[rIndex] ) {
surrounded = false;
}
}
- for ( int i = 0 ; i < identAfter && surrounded ; i++ ) {
- if ( bases[readOfs+1+i] != ref[refOfs+1+i] ) {
+ for (int i = 0; i < surroundAfter && surrounded ; i++ ) {
+ final int bIndex = readOfs+1+i;
+ final int rIndex = refOfs+1+i;
+ if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) {
+ surrounded = false;
+ continue;
+ }
+ if ( bases[bIndex] != ref[rIndex] ) {
surrounded = false;
}
}
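
The repeated blocks above add explicit bounds guards so the surrounding-identity check never indexes past the read or reference arrays. The shared logic, factored into one hedged helper for illustration (not how this change organizes it):

    final class SurroundedCheckSketch {
        static boolean surrounded(final byte[] bases, final byte[] ref,
                                  final int readOfs, final int refOfs,
                                  final int surroundBefore, final int surroundAfter) {
            for (int i = 0; i < surroundBefore; i++) {
                final int bIndex = readOfs - 1 - i;
                final int rIndex = refOfs - 1 - i;
                if (bIndex < 0 || rIndex < 0 || bases[bIndex] != ref[rIndex]) {
                    return false;        // off the left edge, or flanking mismatch
                }
            }
            for (int i = 0; i < surroundAfter; i++) {
                final int bIndex = readOfs + 1 + i;
                final int rIndex = refOfs + 1 + i;
                if (bIndex >= bases.length || rIndex >= ref.length || bases[bIndex] != ref[rIndex]) {
                    return false;        // off the right edge, or flanking mismatch
                }
            }
            return true;
        }
    }
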
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java
index 7efb5687ac3..23550fd75a6 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java
@@ -4,7 +4,7 @@
import com.google.common.primitives.Ints;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFConstants;
-import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.annotator.*;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java
index cee262d277c..7b32156c40a 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java
@@ -5,7 +5,7 @@
import htsjdk.samtools.util.Locatable;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.Tuple;
-import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthScorer.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthScorer.java
new file mode 100644
index 00000000000..f3191fbaadb
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthScorer.java
@@ -0,0 +1,925 @@
+package org.broadinstitute.hellbender.tools.walkers.groundtruth;
+
+import com.opencsv.CSVReader;
+import htsjdk.samtools.CigarOperator;
+import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.SAMReadGroupRecord;
+import htsjdk.variant.variantcontext.VariantContext;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.math3.util.Precision;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.ArgumentCollection;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.argparser.ExperimentalFeature;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.programgroups.FlowBasedProgramGroup;
+import org.broadinstitute.hellbender.engine.*;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.FlowBasedAlignmentArgumentCollection;
+import org.broadinstitute.hellbender.tools.walkers.featuremapping.FlowFeatureMapper;
+import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyBasedCallerUtils;
+import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.FlowBasedAlignmentLikelihoodEngine;
+import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.LikelihoodEngineArgumentCollection;
+import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.ReadLikelihoodCalculationEngine;
+import org.broadinstitute.hellbender.utils.BaseUtils;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.clipping.ReadClipper;
+import org.broadinstitute.hellbender.utils.haplotype.FlowBasedHaplotype;
+import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
+import org.broadinstitute.hellbender.utils.read.*;
+import org.broadinstitute.hellbender.utils.report.GATKReport;
+import org.broadinstitute.hellbender.utils.report.GATKReportTable;
+
+import java.io.*;
+import java.text.DecimalFormat;
+import java.util.*;
+import java.util.zip.GZIPOutputStream;
+
+/**
+ * Converts Ultima reads into flow-based annotation, and provides some general statistics regarding
+ * quality and errors relative to the reference. Ultima data is flow-based, and thus the original computed
+ * quality refers to each flow, rather than each base. In the Ultima cram/bam, there is a per-base representation
+ * of the original flow qualities, where the original quality is distributed along each flow (homopolymer).
+ * In order to reconstitute the original flow information, the tool incorporates the information encoded
+ * in the Ultima cram/bam, and outputs both the read in flow space, as well as a conversion of the aligned
+ * reference portion into flow space, and an alignment score.
+ *
+ * Input
+ *
+ * - Ultima aligned SAM/BAM/CRAM
+ *
+ *
+ * Output
+ *
+ * - Per read ground truth information CSV and a ground truth scoring quality report, in GATK report format
+ *
+ *
+ * CSV Output Description
+ * A CSV with the read representation in flow space, including the following columns:
+ * ReadName
+ * ReadKey : The signal of the read at each flow according to the flow order
+ * ReadIsReversed : Whether the read is reversed in the alignment
+ * ReadMQ : The mapping quality of the read
+ * ReadRQ : The read rq value
+ * GroundTruthKey : The aligned reference section, translated into per-flow signals
+ * ReadSequence
+ * Score : A flow-based alignment score. Since the alignment is per-flow, in the case that there’s a cycle skip, the read and reference flow signals will not be aligned, and therefore the score will be inaccurate.
+ * NormalizedScore: A flow-based normalized alignment score
+ * ErrorProbability : The error of each flow (corresponds to the signals in ReadKey)
+ * ReadKeyLength
+ * GroundTruthKeyLength
+ * CycleSkipStatus : One of NS (Non Skip), PCS (Possible Cycle Skip), or CS (Cycle Skip)
+ * Cigar
+ * LowestQBaseTP
+ *
+ * GATK Report Description
+ * In the quality report (optional), the following tables are included:
+ *
+ * qualReport:error rate per qual : The error rate for each quality. Columns:
+ *
+ * - qual: The encoded quality
+ *
+ * - count: The number of times the quality was observed
+ *
+ * - error: The error rate of the flows with this qual
+ *
+ * - phred: the error translated into a phred score
+ *
+ *
+ * qual_hmerReport:error rate per qual by hmer. The error rate for each quality and hmer combination. Columns:
+ *
+ * - qual: The encoded quality
+ *
+ * - hmer: The hmer length
+ *
+ * - count: The number of times the quality was observed
+ *
+ * - error: The error rate of the flows with this qual
+ *
+ *
+ * qual_hmer_deviation_base_Report:error rate per qual by hmer and deviation. The count of errors for each qual, hmer, deviation and base
+ *
+ * - qual: The encoded quality
+ *
+ * - hmer: The hmer length
+ *
+ * - deviation: The deviation (difference in signal, relative to the reference)
+ *
+ * - base: The base
+ *
+ * - count: The number of times the deviation was observed
+ *
+ *
+ * Phred/qual statistics per flow position report. Various statistics for each flow position in relationship to the found quality value. Columns:
+ *
+ * - flow - flow position
+ * - count - count of observations
+ * - min - minimal observed quality
+ * - max - maximal observed quality
+ * - mean - mean observed value
+ * - median - median observed value
+ * - std - standard deviation
+ * - p1...pn - percentile columns, according to the --quality-percentiles parameter
+ *
+ *
+ * Usage examples
+ *
+ * gatk GroundTruthScorer \
+ * -I input.bam \
+ * -R reference.fasta.gz \
+ * -L chr20 \
+ * --output-csv output.csv \
+ * --report-file report.txt \
+ * --omit-zeros-from-report \ (optional)
+ * --features-file dbsnp.chr9.vcf.gz \ (optional)
+ * --genome-prior genome_prior.csv (optional)
+ *
+ *
+ * {@GATK.walkertype ReadWalker}
+ */
+
+@CommandLineProgramProperties(
+ summary = "Ground Truth Scorer",
+ oneLineSummary = "Score reads against a reference/ground truth",
+ programGroup = FlowBasedProgramGroup.class
+)
+@DocumentedFeature
+@ExperimentalFeature
+public class GroundTruthScorer extends ReadWalker {
+ private static final Logger logger = LogManager.getLogger(GroundTruthScorer.class);
+
+ public static final String OUTPUT_CSV_LONG_NAME = "output-csv";
+ public static final String REPORT_FILE_LONG_NAME = "report-file";
+ public static final String USE_SOFTCLIPPED_BASES_LONG_NAME = "use-softclipped-bases";
+ public static final String GENOME_PRIOR_LONG_NAME = "genome-prior";
+ public static final String FEATURES_FILE_LONG_NAME = "features-file";
+ public static final String NORMALIZED_SCORE_THRESHOLD_LONG_NAME = "normalized-score-threshold";
+ public static final String ADD_MEAN_CALL_LONG_NAME = "add-mean-call";
+ public static final String GT_NO_OUTPUT_LONG_NAME = "gt-no-output";
+ public static final String OMIT_ZEROS_FROM_REPORT = "omit-zeros-from-report";
+ public static final String QUALITY_PERCENTILES = "quality-percentiles";
+ public static final String EXCLUDE_ZERO_FLOWS = "exclude-zero-flows";
+
+ private static final int QUAL_VALUE_MAX = 60;
+ private static final int HMER_VALUE_MAX = 100; //TODO: This should become a parameter
+ private static final int BASE_VALUE_MAX = FlowBasedRead.DEFAULT_FLOW_ORDER.length() - 1;
+
+ private static final double NORMALIZED_SCORE_THRESHOLD_DEFAULT = -0.1;
+
+ /*
+ Private accumulator class for counting false/true observations (hence Boolean).
+
+ Observations are counted at a top level and are also optionally classified into a set of bins (the
+ number of which is fixed upon construction). The bins themselves are also BooleanAccumulator objects,
+ resulting in a tree-like multi-level accumulator.
+
+
+ GroundTruthScorer builds a four-level-deep accumulation tree, which can support observations of a
+ boolean event with 3-deep context (bin1,bin2,bin3).
+
+ Once accumulation is done, the instance is able to generate a suitable GATKReportTable for any given
+ bin depth (1, 2 or 3).
+
+ */
+ private static class BooleanAccumulator {
+ long falseCount;
+ long trueCount;
+ BooleanAccumulator[] bins;
+
+ // add an observation to this accumulator
+ void add(final boolean b) {
+ if (b) {
+ trueCount++;
+ } else {
+ falseCount++;
+ }
+ }
+
+ // add an observation to this accumulator and to one of the bins in the level under it (1 deep)
+ void add(final boolean b, final int bin) {
+ add(b);
+ if ( bins != null && bin >= 0 && bin < bins.length ) {
+ bins[bin].add(b);
+ } else {
+ logger.warn("bin out of range; " + bin + ", range: [0," + bins.length + "), clipped");
+ bins[Math.max(0, Math.min(bin, bins.length - 1))].add(b);
+ }
+ }
+
+ // add an observation to this accumulator and to two levels of bins under it (2 deep)
+ void add(final boolean b, final int bin, final int bin2) {
+ add(b);
+ if ( bins != null && bin >= 0 && bin < bins.length ) {
+ bins[bin].add(b, bin2);
+ } else {
+ logger.warn("bin out of range; " + bin + ", range: [0," + bins.length + "), clipped");
+ bins[Math.max(0, Math.min(bin, bins.length - 1))].add(b, bin2);
+ }
+ }
+
+ // add an observation to this accumulator and to three levels of bins under it (3 deep)
+ void add(final boolean b, final int bin, final int bin2, final int bin3) {
+ add(b);
+ if ( bins != null && bin >= 0 && bin < bins.length ) {
+ bins[bin].add(b, bin2, bin3);
+ } else {
+ logger.warn("bin out of range; " + bin + ", range: [0," + bins.length + "), clipped");
+ bins[Math.max(0, Math.min(bin, bins.length - 1))].add(b, bin2, bin3);
+ }
+ }
+
+ // get observation count of this accumulator
+ long getCount() {
+ return falseCount + trueCount;
+ }
+
+ // get the false rate/ratio for this accumulator
+ double getFalseRate() {
+ return (getCount() == 0) ? 0.0 : ((double)falseCount / getCount());
+ }
+
+ // create a set of accumulators with 3-deep bin nesting
+ static BooleanAccumulator[] newReport(final int size, final int binCount, final int binCount2, final int binCount3) {
+ BooleanAccumulator[] report = new BooleanAccumulator[size];
+ for ( byte i = 0 ; i < report.length ; i++ ) {
+ report[i] = new BooleanAccumulator();
+ if ( binCount != 0 ) {
+ report[i].bins = new BooleanAccumulator[binCount];
+ for ( int j = 0 ; j < report[i].bins.length ; j++ ) {
+ report[i].bins[j] = new BooleanAccumulator();
+ if ( binCount2 != 0 ) {
+ report[i].bins[j].bins = new BooleanAccumulator[binCount2];
+ for ( int k = 0 ; k < report[i].bins[j].bins.length ; k++ ) {
+ report[i].bins[j].bins[k] = new BooleanAccumulator();
+ if (binCount3 != 0) {
+ report[i].bins[j].bins[k].bins = new BooleanAccumulator[binCount3];
+ for (int m = 0; m < report[i].bins[j].bins[k].bins.length; m++) {
+ report[i].bins[j].bins[k].bins[m] = new BooleanAccumulator();
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ return report;
+ }
+
+ // create a GATK report from a set of accumulators without nesting into their bins
+ static GATKReportTable newReportTable(final BooleanAccumulator[] report, final String name, final double probThreshold, final boolean omitZeros) {
+ final GATKReportTable table = new GATKReportTable(name + "Report", "error rate per " + name, 4);
+ table.addColumn(name, "%d");
+ table.addColumn("count", "%d");
+ table.addColumn("error", "%f");
+ table.addColumn("phred", "%d");
+ int rowIndex = 0;
+ for (int i = 0; i < report.length; i++) {
+ if ( omitZeros && i != 0 && report[i].getCount() == 0 )
+ continue;
+ else {
+ final double rate = report[i].getFalseRate();
+ final double phredRate = (rate == 0.0 && report[i].getCount() != 0 && probThreshold != 0.0) ? probThreshold : rate;
+
+ table.set(rowIndex, 0, i);
+ table.set(rowIndex, 1, report[i].getCount());
+ table.set(rowIndex, 2, rate);
+ table.set(rowIndex, 3, phredRate != 0 ? (int) Math.ceil(-10.0 * Math.log10(phredRate)) : 0);
+ rowIndex++;
+ }
+ }
+ return table;
+ }
+
+ // create a GATK report from a set of accumulators while nesting into one level of their bins (1 deep)
+ static GATKReportTable newReportTable(final BooleanAccumulator[] report, final String name1, final String name2, final boolean omitZeros) {
+ final GATKReportTable table = new GATKReportTable(name1 + "_" + name2 + "Report", "error rate per " + name1 + " by " + name2, 4);
+ table.addColumn(name1, "%d");
+ table.addColumn(name2, "%d");
+ table.addColumn("count", "%d");
+ table.addColumn("error", "%f");
+ int rowIndex = 0;
+ for (int i = 0; i < report.length; i++) {
+ for ( int j = 0; j < report[i].bins.length ; j++ ) {
+ if ( omitZeros && (i != 0 || j != 0) && report[i].bins[j].getCount() == 0 )
+ continue;
+ else {
+ table.set(rowIndex, 0, i);
+ table.set(rowIndex, 1, j);
+ table.set(rowIndex, 2, report[i].bins[j].getCount());
+ table.set(rowIndex, 3, report[i].bins[j].getFalseRate());
+ rowIndex++;
+ }
+ }
+ }
+ return table;
+ }
+
+ // create a GATK report from a set of accumulators while nesting into three levels of their bins (3 deep)
+ static GATKReportTable newReportTable(final BooleanAccumulator[] report, final String name1, final String name2, final String name3, final String name4, final boolean omitZeros) {
+ final GATKReportTable table = new GATKReportTable(name1 + "_" + name2 + "_" + name3 + "_" + name4 + "_Report", "error rate per " + name1 + " by " + name2 + " and " + name3, 5);
+ table.addColumn(name1, "%d");
+ table.addColumn(name2, "%d");
+ table.addColumn(name3, "%s");
+ table.addColumn(name4, "%s");
+ table.addColumn("count", "%d");
+ int rowIndex = 0;
+ for (int i = 0; i < report.length; i++) {
+ for ( int j = 0; j < report[i].bins.length ; j++ ) {
+ for ( int k = 0; k < report[i].bins[j].bins.length ; k++ ) {
+ for ( int m = 0; m < report[i].bins[j].bins[k].bins.length ; m++ ) {
+ if ( omitZeros && (i != 0 || j != 0 || k != 0 || m != 0) && report[i].bins[j].bins[k].bins[m].getCount() == 0 )
+ continue;
+ else {
+ table.set(rowIndex, 0, i);
+ table.set(rowIndex, 1, j);
+ table.set(rowIndex, 2, binToDeviation(k));
+ table.set(rowIndex, 3, String.format("%c", binToBase(m)));
+ table.set(rowIndex, 4, report[i].bins[j].bins[k].bins[m].getCount());
+ rowIndex++;
+ }
+ }
+ }
+ }
+ }
+ return table;
+ }
+ }
+
+ private static class PercentileReport extends SeriesStats {
+
+ static GATKReportTable newReportTable(final Vector<PercentileReport> report, String qualityPercentiles) {
+ String[] qp = qualityPercentiles.split(",");
+ final GATKReportTable table = new GATKReportTable("PhredBinAccumulator", "PhredBinAccumulator", 8 + qp.length);
+ table.addColumn("flow", "%d");
+ table.addColumn("count", "%d");
+ table.addColumn("min", "%f");
+ table.addColumn("max", "%f");
+ table.addColumn("mean", "%f");
+ table.addColumn("median", "%f");
+ table.addColumn("std", "%f");
+ for ( final String p : qp ) {
+ table.addColumn("p" + p, "%f");
+ }
+ int rowIndex = 0;
+ for ( final PercentileReport r : report ) {
+ int col = 0;
+ table.set(rowIndex, col++, rowIndex);
+ table.set(rowIndex, col++, r.getCount());
+ table.set(rowIndex, col++, r.getMin());
+ table.set(rowIndex, col++, r.getMax());
+ table.set(rowIndex, col++, r.getMean());
+ table.set(rowIndex, col++, r.getMedian());
+ table.set(rowIndex, col++, r.getStd());
+ for ( String p : qp ) {
+ table.set(rowIndex, col++, r.getPercentile(Double.parseDouble(p)));
+ }
+ rowIndex++;
+ }
+ return table;
+ }
+
+ void addProb(double p) {
+ super.add(-10 * Math.log10(p));
+ }
+ }
+
+ @Argument(fullName = OUTPUT_CSV_LONG_NAME, doc="main CSV output file. supported file extensions: .csv, .csv.gz.")
+ public GATKPath outputCsvPath = null;
+
+ @Argument(fullName = REPORT_FILE_LONG_NAME, doc="report output file.", optional = true)
+ public GATKPath reportFilePath = null;
+
+ @ArgumentCollection
+ public LikelihoodEngineArgumentCollection likelihoodArgs = new LikelihoodEngineArgumentCollection();
+
+ @ArgumentCollection
+ public FlowBasedAlignmentArgumentCollection fbargs = new FlowBasedAlignmentArgumentCollection();
+
+ @Argument(fullName = USE_SOFTCLIPPED_BASES_LONG_NAME, doc="Include soft-clipped bases (reverts soft clips rather than hard-clipping them).", optional = true)
+ public boolean useSoftclippedBases;
+
+ @Argument(fullName = GENOME_PRIOR_LONG_NAME, doc="CSV input file containing genome-prior (one line per base with hmer frequencies).", optional = true)
+ public GATKPath genomePriorPath;
+
+ @Argument(fullName = FEATURES_FILE_LONG_NAME, doc="A VCF file containing features to be used as a basis for filtering reads.", optional = true)
+ public FeatureDataSource<VariantContext> features;
+
+ @Argument(fullName = NORMALIZED_SCORE_THRESHOLD_LONG_NAME, doc="threshold for normalized score, below which reads are ignored", optional = true)
+ public double normalizedScoreThreshold = NORMALIZED_SCORE_THRESHOLD_DEFAULT;
+
+ @Argument(fullName = ADD_MEAN_CALL_LONG_NAME, doc="Add ReadMeanCall and ReadProbs columns to output", optional = true)
+ public boolean addMeanCalll;
+
+ @Argument(fullName = GT_NO_OUTPUT_LONG_NAME, doc = "do not generate output records", optional = true)
+ public boolean noOutput = false;
+
+ @Argument(fullName = OMIT_ZEROS_FROM_REPORT, doc = "omit zero values from output report", optional = true)
+ public boolean omitZerosFromReport = false;
+
+ @Argument(fullName = QUALITY_PERCENTILES, doc = "list of quality percentiles, defaults to 10,25,50,75,90", optional = true)
+ public String qualityPercentiles = "10,25,50,75,90";
+
+ @Argument(fullName = EXCLUDE_ZERO_FLOWS, doc = "should flows with a call of zero be included in the percentile report?", optional = true)
+ public boolean excludeZeroFlows = false;
+
+ // locals
+ private FlowBasedAlignmentLikelihoodEngine likelihoodCalculationEngine;
+ private PrintWriter outputCsv;
+ private DecimalFormat doubleFormat = new DecimalFormat("0.0#####");
+ private GenomePriorDB genomePriorDB;
+ private BooleanAccumulator[] qualReport;
+ private String[] csvFieldOrder;
+ private Vector<PercentileReport> percentileReports;
+
+ // static/const
+ static final private String[] CSV_FIELD_ORDER_BASIC = {
+ "ReadName", "ReadKey", "ReadIsReversed", "ReadMQ", "ReadRQ", "GroundTruthKey", "ReadSequence",
+ "Score", "NormalizedScore", "ErrorProbability",
+ "ReadKeyLength", "GroundTruthKeyLength", "CycleSkipStatus", "Cigar", "LowestQBaseTP"
+ };
+ static final private String[] CSV_FIELD_ORDER_MEAN_CALL = {
+ "ReadProbs", "ReadMeanCall"
+ };
+
+ @Override
+ public void onTraversalStart() {
+ super.onTraversalStart();
+
+ // establish csv fields
+ List<String> order = new LinkedList<>(Arrays.asList(CSV_FIELD_ORDER_BASIC));
+ if ( addMeanCalll ) {
+ order.addAll(Arrays.asList(CSV_FIELD_ORDER_MEAN_CALL));
+ }
+ csvFieldOrder = order.toArray(new String[0]);
+
+ // create likelihood engine
+ final ReadLikelihoodCalculationEngine engine = AssemblyBasedCallerUtils.createLikelihoodCalculationEngine(likelihoodArgs, false);
+ if ( engine instanceof FlowBasedAlignmentLikelihoodEngine ) {
+ likelihoodCalculationEngine = (FlowBasedAlignmentLikelihoodEngine)engine;
+ } else {
+ throw new GATKException("must use a flow based likelihood calculation engine");
+ }
+
+ // open genome prior if provided
+ if ( genomePriorPath != null ) {
+ try {
+ genomePriorDB = new GenomePriorDB(genomePriorPath);
+ } catch (IOException e) {
+ throw new GATKException("failed to open genome-prior file: " + genomePriorPath);
+ }
+ }
+
+ // open output, write header
+ try {
+ if (outputCsvPath.toPath().toString().endsWith(".gz")) {
+ outputCsv = new PrintWriter(new GZIPOutputStream(outputCsvPath.getOutputStream()));
+ } else {
+ outputCsv = new PrintWriter(outputCsvPath.getOutputStream());
+ }
+ } catch (IOException e) {
+ throw new GATKException("failed to open csv output: " + outputCsvPath, e);
+ }
+ emitCsvHeaders();
+
+ // initialize reports
+ if ( reportFilePath != null ) {
+ qualReport = BooleanAccumulator.newReport(QUAL_VALUE_MAX + 1, HMER_VALUE_MAX + 1, deviationToBin(HMER_VALUE_MAX + 1), BASE_VALUE_MAX + 1);
+
+ // establish max hmer size of flow input
+ int maxClass = FlowBasedRead.MAX_CLASS;
+ final SAMFileHeader header = getHeaderForReads();
+ if ( header != null ) {
+ for ( SAMReadGroupRecord rg : header.getReadGroups() ) {
+ if ( rg.getAttribute(FlowBasedRead.MAX_CLASS_READ_GROUP_TAG) != null ) {
+ maxClass = Math.max(maxClass, Integer.parseInt(rg.getAttribute(FlowBasedRead.MAX_CLASS_READ_GROUP_TAG)));
+ }
+ }
+ }
+ percentileReports = new Vector<>();
+ }
+ }
+
+ @Override
+ public void closeTool() {
+
+ // close main output csv
+ if ( outputCsv != null ) {
+ outputCsv.close();
+ }
+
+ // write reports
+ if ( reportFilePath != null ) {
+ final GATKReport report = new GATKReport(
+ BooleanAccumulator.newReportTable(qualReport, "qual", fbargs.probabilityRatioThreshold, omitZerosFromReport),
+ BooleanAccumulator.newReportTable(qualReport, "qual", "hmer", omitZerosFromReport),
+ BooleanAccumulator.newReportTable(qualReport, "qual", "hmer", "deviation", "base", omitZerosFromReport),
+ PercentileReport.newReportTable(percentileReports, qualityPercentiles)
+ );
+ try ( final PrintStream ps = new PrintStream(reportFilePath.getOutputStream()) ) {
+ report.print(ps);
+ }
+ }
+
+ super.closeTool();
+ }
+
+ @Override
+ public void apply(final GATKRead read, final ReferenceContext referenceContext, final FeatureContext featureContext) {
+
+ // working with unmapped reads is really not practical, as we need the reference to work out ground truth
+ if ( read.isUnmapped() )
+ return;
+ if ( referenceContext.getWindow() == null )
+ return;
+
+ // handle read clipping
+ final GATKRead clippedRead;
+ if (isSoftClipped(read) ) {
+ if (useSoftclippedBases) {
+ referenceContext.setWindow(read.getStart() - read.getUnclippedStart(), read.getUnclippedEnd() - read.getEnd());
+ clippedRead = ReadClipper.revertSoftClippedBases(read);
+ } else {
+ clippedRead = ReadClipper.hardClipSoftClippedBases(read);
+ }
+ } else {
+ clippedRead = read;
+ }
+
+ // filter?
+ if ( (features != null) && !filter(clippedRead, referenceContext) ) {
+ return;
+ }
+
+ // create flow read/haplotype
+ final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), clippedRead);
+ final FlowBasedRead flowRead = new FlowBasedRead(clippedRead, rgInfo.flowOrder, rgInfo.maxClass, fbargs);
+ final Haplotype haplotype = new Haplotype(referenceContext.getBases(), true);
+ final FlowBasedHaplotype flowHaplotype = new FlowBasedHaplotype(haplotype, rgInfo.flowOrder);
+
+ // is this really needed?
+ if ( !flowRead.isValid() ) {
+ return;
+ }
+
+ // compute score
+ final int hapKeyLength = flowHaplotype.getKeyLength();
+ final double score = FlowFeatureMapper.computeLikelihoodLocal(flowRead, flowHaplotype, hapKeyLength, false);
+ final double normalizedScore = score / flowRead.getKeyLength();
+ if ( normalizedScore < normalizedScoreThreshold )
+ return;
+
+ // compute error probability
+ final double[] errorProb = computeErrorProb(flowRead, genomePriorDB);
+
+ // cycle skip
+ final FlowBasedReadUtils.CycleSkipStatus cycleSkipStatus = FlowBasedReadUtils.getCycleSkipStatus(flowRead, referenceContext);
+
+ // accumulate reports
+ if ( cycleSkipStatus != FlowBasedReadUtils.CycleSkipStatus.CS && qualReport != null ) {
+ addToQualReport(flowRead, referenceContext, errorProb);
+ }
+
+ // emit
+ try {
+ emit(flowRead, flowHaplotype, score, normalizedScore, errorProb, read, cycleSkipStatus);
+ } catch (IOException e) {
+ throw new GATKException("failed to write output record", e);
+ }
+ }
+
+ private boolean filter(final GATKRead read, final ReferenceContext referenceContext) {
+
+ // loop on features contained in the read, check that they are in agreement with read data
+ Iterator<VariantContext> iter = features.query(new SimpleInterval(read));
+ byte[] ref = referenceContext.getBases();
+ while ( iter.hasNext() ) {
+ final VariantContext vc = iter.next();
+ for ( int refCoord = vc.getStart() ; refCoord <= vc.getEnd() ; refCoord++ ) {
+
+ // get byte from read
+ Optional<Byte> readByte = ReadUtils.getReadBaseAtReferenceCoordinate(read, refCoord);
+ if ( !readByte.isPresent() ) {
+ return false;
+ }
+
+ // get byte from reference
+ byte refByte = ref[refCoord - referenceContext.getWindow().getStart()];
+
+ // compare
+ if ( refByte != readByte.get() ) {
+ return false;
+ }
+ }
+ }
+
+ // if here, no interference detected
+ return true;
+ }
+
+ /*
+ * compute error probability vector for a read
+ *
+ * The vector has one element for each flow key, holding one minus the probability of the called hmer length
+ * (i.e., the probability of error). This is further adjusted by the optional presence of a genome-prior database,
+ * which provides per-base prior factors for each hmer length.
+ */
+ private double[] computeErrorProb(final FlowBasedRead flowRead, final GenomePriorDB genomePriorDB) {
+
+ final int[] key = flowRead.getKey();
+ final byte[] flowOrder = flowRead.getFlowOrderArray();
+
+ final double[] probCol = new double[flowRead.getMaxHmer() + 1];
+ double[] result = new double[key.length];
+
+ for ( int i = 0 ; i < key.length ; i++ ) {
+
+ // step 1 - extract column & sum
+ double sum = 0;
+ for ( int j = 0 ; j < probCol.length ; j++ ) {
+ sum += (probCol[j] = flowRead.getProb(i, j));
+ }
+
+ // step 2 - normalize column
+ if ( sum != 0.0 ) {
+ for (int j = 0; j < probCol.length; j++) {
+ probCol[j] /= sum;
+ }
+ }
+
+ // step 3 - scale according to prior genome?
+ if ( genomePriorDB != null ) {
+
+ long[] prior = genomePriorDB.getPriorForBase(flowOrder[i]);
+ sum = 0;
+ if ( prior != null ) {
+ for (int j = 0; j < probCol.length; j++) {
+ sum += (probCol[j] *= prior[j]);
+ }
+ }
+
+ // assign normalized result
+ if ( sum != 0.0 ) {
+ result[i] = 1 - (probCol[Math.min(probCol.length - 1, Math.min(key[i], flowRead.getMaxHmer()))] / sum);
+ } else {
+ // revert to non-prior normalization
+ result[i] = 1 - probCol[Math.min(key[i], flowRead.getMaxHmer())];
+ }
+ } else {
+
+ // assign normalized result
+ result[i] = 1 - probCol[Math.min(key[i], flowRead.getMaxHmer())];
+ }
+
+ // accumulate error probabilities
+ if ( percentileReports != null ) {
+ if ( key[i] != 0 || !excludeZeroFlows ) {
+ while ( percentileReports.size() < (i + 1) ) {
+ percentileReports.add(new PercentileReport());
+ }
+ percentileReports.get(i).addProb(result[i]);
+ }
+ }
+ }
+
+ return result;
+ }
+
+ /*
+ * compute lowest-quality-base-tp-value vector for a read
+ *
+ * The vector has one element for each flow key, representing the value of the tp for the hmer base
+ * which has the lowest quality value
+ *
+ * example:
+ * Bases: TTTTT
+ * Qs: ABCBA
+ * Tp: -1 1 0 1 -1
+ * Output: -1
+ */
+ private byte[] computeLowestQBaseTP(final FlowBasedRead flowRead) {
+
+ final int[] key = flowRead.getKey();
+ byte[] result = new byte[key.length];
+ final byte[] tp = flowRead.getAttributeAsByteArray("tp");
+ final byte[] qual = flowRead.getBaseQualitiesNoCopy();
+
+ // loop on key
+ int seq_i = 0;
+ for ( int i = 0 ; i < key.length ; i++ ) {
+
+ // extract hmer length, zero is easy
+ int hmer = key[i];
+ if ( hmer == 0 ) {
+ result[i] = 0;
+ continue;
+ }
+
+ // scan qualities for the lowest value, start with first
+ // as qualities and tp are symmetric, we can scan up to the middle
+ // when finding the middle (offset) account for even/odd hmers
+ result[i] = tp[seq_i];
+ byte lowestQ = qual[seq_i];
+ int hmer_scan_length = (hmer + 1) / 2;
+ for ( int j = 1 ; j < hmer_scan_length ; j++ ) {
+ if ( qual[seq_i + j] < lowestQ ) {
+ result[i] = tp[seq_i + j];
+ lowestQ = qual[seq_i + j];
+ }
+ }
+
+ // advance
+ seq_i += hmer;
+ }
+
+ return result;
+ }
+
+ private void emitCsvHeaders() {
+
+ outputCsv.println(StringUtils.join(csvFieldOrder, ","));
+ }
+
+ private void emit(final FlowBasedRead flowRead, final FlowBasedHaplotype refHaplotype, double score, final double normalizedScore, final double[] errorProb,
+ GATKRead read,
+ FlowBasedReadUtils.CycleSkipStatus cycleSkipStatus) throws IOException {
+
+ // build line columns
+ final Map<String, Object> cols = new LinkedHashMap<>();
+
+ // read info
+ cols.put("ReadName", flowRead.getName());
+ cols.put("ReadIsReversed", flowRead.isReverseStrand() ? 1 : 0);
+ cols.put("ReadMQ", flowRead.getMappingQuality());
+ cols.put("ReadRQ", flowRead.getAttributeAsFloat("rq"));
+ cols.put("CycleSkipStatus", cycleSkipStatus);
+ cols.put("Cigar", read.getCigar().toString());
+
+
+ // keys, seq, etc
+ cols.put("ReadKey", "\"" + StringUtils.join(flowRead.getKey(), ',') + "\"");
+ cols.put("GroundTruthKey", "\"" + StringUtils.join(refHaplotype.getKey(), ',') + "\"");
+ cols.put("ReadSequence", flowRead.getBasesString());
+ cols.put("ReadKeyLength", flowRead.getKeyLength());
+ cols.put("GroundTruthKeyLength", refHaplotype.getKeyLength());
+
+ // scores
+ cols.put("Score", score);
+ cols.put("NormalizedScore", normalizedScore);
+ cols.put("ErrorProbability", "\"" + StringUtils.join(
+ Arrays.stream(errorProb).mapToObj(v -> doubleFormat.format(v)).toArray(),
+ ',') + "\"");
+
+ // lowest q base tp
+ final byte[] lowestQBaseTP = computeLowestQBaseTP(flowRead);
+ cols.put("LowestQBaseTP", "\"" + StringUtils.join(lowestQBaseTP, ',') + "\"");
+
+ // add read probabilities
+ if ( addMeanCalll ) {
+ double[][] readProbsAndMeanCall = collectReadProbs(flowRead);
+ cols.put("ReadProbs", "\"" + StringUtils.join(
+ Arrays.stream(readProbsAndMeanCall[0]).mapToObj(v -> doubleFormat.format(v)).toArray(),
+ ',') + "\"");
+ cols.put("ReadMeanCall", "\"" + StringUtils.join(
+ Arrays.stream(readProbsAndMeanCall[1]).mapToObj(v -> doubleFormat.format(v)).toArray(),
+ ',') + "\"");
+ }
+
+ // construct line
+ StringBuilder sb = new StringBuilder();
+ int colIndex = 0;
+ for ( String field : csvFieldOrder ) {
+ if ( colIndex++ > 0 ) {
+ sb.append(',');
+ }
+ if ( !cols.containsKey(field) ) {
+ throw new GATKException("column missing from csv line: " + field);
+ }
+ sb.append(cols.get(field));
+ cols.remove(field);
+ }
+ if ( cols.size() > 0 ) {
+ throw new GATKException("invalid columns on csv line: " + cols.keySet());
+ }
+
+ // output line
+ if ( !noOutput ) {
+ outputCsv.println(sb);
+ }
+ }
+
+ private double[][] collectReadProbs(final FlowBasedRead read) {
+
+ final int keyLength = read.getKeyLength();
+ final int maxHmer = read.getMaxHmer();
+ final double[] probs = new double[keyLength * (maxHmer + 1)];
+ final double[] meanCall = new double[keyLength];
+
+ // retrieve probs
+ int pos = 0;
+ for ( int flow = 0 ; flow < keyLength ; flow++ ) {
+ double mc = 0;
+ int mc_sum = 0;
+ for ( int hmer = 0 ; hmer <= maxHmer ; hmer++ ) {
+ final double p = read.getProb(flow, hmer);
+ probs[pos++] = p;
+ mc += (p * hmer);
+ mc_sum += hmer;
+ }
+ meanCall[flow] = mc / mc_sum;
+ }
+
+ return new double[][] {probs, meanCall};
+ }
+
+ static private class GenomePriorDB {
+
+ final private Map<Byte, long[]> db = new LinkedHashMap<>();
+
+ GenomePriorDB(GATKPath path) throws IOException {
+
+ final CSVReader csvReader = new CSVReader(new InputStreamReader(path.getInputStream()));
+ String[] line;
+ while ( (line = csvReader.readNext()) != null ) {
+ long[] prior = new long[HMER_VALUE_MAX + 1];
+ Byte base = line[0].getBytes()[0];
+ for ( int i = 0 ; i < prior.length ; i++ ) {
+ if ( i == 0 ){
+ prior[i] = Long.parseLong(line[i+1]);
+ } else {
+ prior[i] = Long.parseLong(line[i]);
+ }
+ }
+ db.put(base, prior);
+ }
+ }
+
+ long[] getPriorForBase(byte base) {
+ return db.get(base);
+ }
+ }
+
+ private void addToQualReport(FlowBasedRead flowRead, ReferenceContext referenceContext, final double[] errorProb) {
+
+ // convert reference to key space
+ final Haplotype haplotype = new Haplotype(referenceContext.getBases(), true);
+ final FlowBasedHaplotype flowHaplotype = new FlowBasedHaplotype(haplotype, flowRead.getFlowOrder());
+
+ // access keys & flow order
+ final int[] readKey = flowRead.getKey();
+ final int[] hapKey = flowHaplotype.getKey();
+ final byte[] flowOrder = flowRead.getFlowOrder().getBytes();
+
+ // loop on key positions
+ if ( readKey.length != hapKey.length ) {
+ return;
+ }
+ for ( int flow = 0 ; flow < readKey.length ; flow++ ) {
+
+ // determine quality
+ final double prob = Precision.round(errorProb[flow], (QUAL_VALUE_MAX / 10) + 1);
+ final int qual = (int)Math.ceil(-10 * Math.log10(prob));
+
+ // determine if matches reference
+ final int deviation = readKey[flow] - hapKey[flow];
+ final boolean same = (deviation == 0);
+
+ // accumulate
+ if ( qual < qualReport.length ) {
+ int baseBin = baseToBin(flowOrder[flow % flowOrder.length], flowRead.isReverseStrand());
+ qualReport[qual].add(same, readKey[flow], deviationToBin(deviation), baseBin);
+ }
+ }
+ }
+
+ static private int baseToBin(byte base, boolean isReverseStrand) {
+ final byte trueBase = !isReverseStrand ? base : BaseUtils.simpleComplement(base);
+ return FlowBasedRead.DEFAULT_FLOW_ORDER.indexOf(trueBase);
+ }
+
+ static private byte binToBase(int bin) {
+ return (byte)FlowBasedRead.DEFAULT_FLOW_ORDER.charAt(bin);
+ }
+
+ // 0,-1,1,-2,2... -> 0,1,2,3,4...
+ static private int deviationToBin(final int deviation) {
+ if ( deviation >= 0 ) {
+ return deviation * 2;
+ } else {
+ return (-deviation * 2) - 1;
+ }
+ }
+
+ // 0,1,2,3,4... -> 0,-1,1,-2,2...
+ static private String binToDeviation(final int bin) {
+ if ( bin == 0 ) {
+ return "0";
+ } else if ( (bin % 2) == 0 ) {
+ return String.format("+%d", bin / 2);
+ } else {
+ return String.format("%d", -((bin+1) / 2));
+ }
+ }
+
+ static private boolean isSoftClipped( final GATKRead read ) {
+ if ( read.isUnmapped() )
+ return false;
+ if ( read.getCigar().getFirstCigarElement() == null )
+ return false;
+ final CigarOperator firstOperator = read.getCigar().getFirstCigarElement().getOperator();
+ final CigarOperator lastOperator = read.getCigar().getLastCigarElement().getOperator();
+ return (firstOperator == CigarOperator.SOFT_CLIP && lastOperator != CigarOperator.SOFT_CLIP) ||
+ (firstOperator != CigarOperator.SOFT_CLIP && lastOperator == CigarOperator.SOFT_CLIP);
+ }
+}
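For reference, the core of computeErrorProb above, in the common case where no genome-prior database is supplied, reduces to normalizing the per-flow probability column and taking one minus the probability of the called hmer length. Below is a minimal standalone sketch of that path with illustrative values only; FlowErrorProbSketch is not part of the tool.

    public class FlowErrorProbSketch {
        // error probability for one flow: normalize the column of hmer-call
        // probabilities, then take 1 - P(called hmer)
        static double errorProb(final double[] probCol, final int calledHmer) {
            double sum = 0;
            for (final double p : probCol) {
                sum += p;
            }
            final double[] norm = new double[probCol.length];
            for (int j = 0; j < probCol.length; j++) {
                norm[j] = (sum != 0.0) ? probCol[j] / sum : 0.0;
            }
            return 1 - norm[Math.min(calledHmer, probCol.length - 1)];
        }

        public static void main(String[] args) {
            // unnormalized probabilities for hmer lengths 0..3 at a single flow
            final double[] probCol = {0.01, 0.90, 0.08, 0.01};
            System.out.println(errorProb(probCol, 1)); // prints ~0.1, i.e. 1 - 0.9
        }
    }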
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SeriesStats.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SeriesStats.java
new file mode 100644
index 00000000000..151e0ae4867
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SeriesStats.java
@@ -0,0 +1,112 @@
+package org.broadinstitute.hellbender.tools.walkers.groundtruth;
+
+import org.apache.commons.collections.map.LazySortedMap;
+
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.concurrent.atomic.AtomicInteger;
+
+public class SeriesStats {
+
+ // local state
+ private double last = Double.NaN;
+ private int count = 0;
+ private double sum = 0;
+ private double min = Double.NaN;
+ private double max = Double.NaN;
+ private SortedMap<Double, AtomicInteger> bins = new TreeMap<>();
+
+ void add(double v) {
+
+ // save in simple values
+ last = v;
+ sum += v;
+ if ( count > 0 ) {
+ min = Math.min(min, v);
+ max = Math.max(max, v);
+ } else {
+ min = max = v;
+ }
+ count++;
+
+ // save in bins
+ if ( bins.containsKey(v) ) {
+ bins.get(v).incrementAndGet();
+ } else {
+ bins.put(v, new AtomicInteger(1));
+ }
+ }
+
+ public double getLast() {
+ return last;
+ }
+
+ public int getCount() {
+ return count;
+ }
+
+ public double getMin() {
+ return (count != 0) ? min : Double.NaN;
+ }
+
+ public double getMax() {
+ return (count != 0) ? max : Double.NaN;
+ }
+
+ public int getUniq() {
+ return bins.size();
+ }
+
+ public double getMean() {
+ return (count != 0) ? (sum / count) : Double.NaN;
+ }
+
+ public double getMedian() {
+ return getPercentile(50);
+ }
+
+ public double getPercentile(double percentile) {
+ if ( count == 0 ) {
+ return Double.NaN;
+ } else if ( count == 1 ) {
+ return last;
+ } else {
+
+ int percentileIndex = (int)(count * percentile / 100);
+ int index = 0;
+ for (Map.Entry<Double, AtomicInteger> entry : bins.entrySet() ) {
+ int binSize = entry.getValue().get();
+ if ( percentileIndex >= index && (percentileIndex < (index + binSize)) ) {
+ return entry.getKey();
+ }
+ index += binSize;
+ }
+
+ // if here, we need the highest entry
+ return bins.lastKey();
+ }
+ }
+
+ public double getStd() {
+
+ if (count == 0) {
+ return Double.NaN;
+ }
+
+ // calculate mean
+ double mean = getMean();
+
+ // calculate sum of sqr(deviations)
+ double vsum = 0;
+ for (Map.Entry<Double, AtomicInteger> entry : bins.entrySet()) {
+ int binSize = entry.getValue().get();
+ vsum += (Math.pow(entry.getKey() - mean, 2) * binSize);
+ }
+
+ // calculate variance and std
+ double variance = vsum / count;
+ return Math.sqrt(variance);
+ }
+
+}
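As a quick illustration of how SeriesStats accumulates values in its sorted bin map and derives order statistics from it, here is a hypothetical usage sketch. SeriesStatsDemo is not part of the change, and it would need to live in the same package since add() is package-private.

    package org.broadinstitute.hellbender.tools.walkers.groundtruth;

    public class SeriesStatsDemo {
        public static void main(String[] args) {
            final SeriesStats stats = new SeriesStats();
            // accumulate a handful of values; duplicates share a bin
            for (final double v : new double[]{10, 20, 20, 30, 40}) {
                stats.add(v);
            }
            System.out.println(stats.getCount());        // 5
            System.out.println(stats.getMean());         // 24.0
            System.out.println(stats.getMedian());       // 20.0 (50th percentile, read off the bin map)
            System.out.println(stats.getPercentile(90)); // 40.0
            System.out.println(stats.getStd());          // ~10.2 (population standard deviation)
        }
    }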
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java
index 17728fa6035..ebc0f61d3f6 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java
@@ -340,7 +340,7 @@ private AlleleLikelihoods getAlleleLikelihoodMatrix(final Alle
.forEach(alleleHaplotypeMap.get(notAllele)::add);
final AlleleLikelihoods<GATKRead, Allele> alleleLikelihoods = readLikelihoods.marginalize(alleleHaplotypeMap);
- logger.debug(() -> String.format("GALM: %s %d %d", allele.toString(), alleleHaplotypeMap.get(allele).size(), alleleHaplotypeMap.get(notAllele).size()));
+ logger.debug(() -> String.format("GALM: %s %d %d", allele, alleleHaplotypeMap.get(allele.altAllele()).size(), alleleHaplotypeMap.get(notAllele).size()));
return alleleLikelihoods;
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCaller.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCaller.java
index f504d400b13..557e7f0add9 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCaller.java
@@ -127,6 +127,21 @@
* argument. Note however that very high ploidies (such as are encountered in large pooled experiments) may cause
* performance challenges including excessive slowness. We are working on resolving these limitations.
*
+ * To use variable ploidy in different regions, such as making haploid calls outside the PAR on chrX or chrY,
+ * see the --ploidy-regions flag. The -ploidy flag sets the default ploidy to use everywhere, and --ploidy-regions
+ * should be a .bed or .interval_list with a "name" column containing the desired ploidy to use in that region
+ * when genotyping. Note that variants near a region boundary may not have the matching ploidy, since the ploidy used
+ * is determined using the following precedence:
+ *
+ * - the ploidy given in --ploidy-regions for any interval overlapping the active region in which your variant is called
+ * (with ties broken by taking the largest ploidy); note that a ploidy interval need only overlap the active region to
+ * determine the ploidy of your variant, even if the end coordinate written for your variant lies outside the given region;
+ * - the ploidy given via the global -ploidy flag;
+ * - the ploidy determined by the default global built-in constant for humans (2).
+ *
+ *
+ * Coordinates for the PAR for GRCh38 can be found here.
+ *
* Additional Notes
*
* - When working with PCR-free data, be sure to set `-pcr_indel_model NONE` (see argument below).
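A concrete invocation following the precedence rules documented above might look like the following; file names are placeholders, and the interval file's "name" column carries the desired per-region ploidy while the -ploidy flag supplies the diploid default everywhere else.

    gatk HaplotypeCaller \
      -R reference.fasta \
      -I input.bam \
      -O output.vcf.gz \
      -ploidy 2 \
      --ploidy-regions haploid_regions.interval_list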
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java
index 7ab58d73264..f818659bb5e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java
@@ -1,7 +1,10 @@
package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;
+import htsjdk.tribble.NamedFeature;
import htsjdk.variant.variantcontext.VariantContext;
+import org.apache.arrow.util.VisibleForTesting;
import org.apache.commons.lang3.ArrayUtils;
+import org.apache.commons.lang3.SerializationUtils;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
@@ -9,6 +12,7 @@
import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.DbsnpArgumentCollection;
+import org.broadinstitute.hellbender.engine.FeatureDataSource;
import org.broadinstitute.hellbender.engine.FeatureInput;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.spark.AssemblyRegionArgumentCollection;
@@ -23,9 +27,11 @@
/**
* Set of arguments for the {@link HaplotypeCallerEngine}
*/
-public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgumentCollection implements Serializable{
+public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgumentCollection implements Serializable {
private static final long serialVersionUID = 1L;
+ public static final String PLOIDY_REGIONS_NAME = "ploidy-regions";
+
public static final String GQ_BAND_LONG_NAME = "gvcf-gq-bands";
public static final String GQ_BAND_SHORT_NAME = "GQB";
public static final String DO_NOT_CORRECT_OVERLAPPING_BASE_QUALITIES_LONG_NAME = "do-not-correct-overlapping-quality";
@@ -61,6 +67,9 @@ protected ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgu
return new HaplotypeCallerReadThreadingAssemblerArgumentCollection();
}
+ @Argument(fullName = PLOIDY_REGIONS_NAME, shortName = PLOIDY_REGIONS_NAME, doc = "Interval file with column specifying desired ploidy for genotyping models. Overrides default ploidy and user-provided --ploidy argument in specific regions.", optional = true)
+ public FeatureInput<NamedFeature> ploidyRegions = null;
+
/**
* You can use this argument to specify that HC should process a single sample out of a multisample BAM file. This
* is especially useful if your samples are all in the same file but you need to run them individually through HC
@@ -312,6 +321,12 @@ boolean isFlowBasedCallingMode() {
return flowMode != FlowMode.NONE;
}
+ // Copy method used to create new hcArgs with same fields except input ploidy model
+ public HaplotypeCallerArgumentCollection copyWithNewPloidy(int ploidy) {
+ HaplotypeCallerArgumentCollection newArgsWithNewPloidy = SerializationUtils.clone(this);
+ newArgsWithNewPloidy.standardArgs.genotypeArgs.samplePloidy = ploidy;
+ return newArgsWithNewPloidy;
+ }
/**
* the different flow modes, in terms of their parameters and their values
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java
index 71f6d64ede7..9e09eeba8a9 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java
@@ -3,7 +3,10 @@
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequenceFile;
+import htsjdk.samtools.util.Locatable;
+import htsjdk.samtools.util.OverlapDetector;
import htsjdk.samtools.util.RuntimeIOException;
+import htsjdk.tribble.NamedFeature;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
@@ -20,12 +23,15 @@
import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
import org.broadinstitute.hellbender.engine.spark.AssemblyRegionArgumentCollection;
import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount;
import org.broadinstitute.hellbender.tools.walkers.annotator.*;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAssignmentMethod;
+import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypingEngine;
import org.broadinstitute.hellbender.tools.walkers.genotyper.MinimalGenotypingEngine;
import org.broadinstitute.hellbender.tools.walkers.genotyper.OutputMode;
import org.broadinstitute.hellbender.tools.walkers.genotyper.StandardCallerArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
+import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.hellbender.utils.haplotype.Event;
import org.broadinstitute.hellbender.utils.pileup.PileupBasedAlleles;
@@ -83,8 +89,31 @@ public class HaplotypeCallerEngine implements AssemblyRegionEvaluator {
protected final OutputStreamWriter assemblyDebugOutStream;
- // the genotyping engine for the isActive() determination
- private MinimalGenotypingEngine activeRegionEvaluationGenotyperEngine = null;
+ /**
+ * List of interval file entries including regions and custom ploidy values to apply in that region.
+ */
+ protected final List<SimpleCount> ploidyRegions = new ArrayList<>();
+
+ /**
+ * An OverlapDetector object for checking whether region overlaps given ploidyRegions.
+ */
+ protected final OverlapDetector<SimpleCount> ploidyRegionsOverlapDetector;
+
+ /**
+ * List of all custom ploidies provided by user
+ */
+ private final LinkedHashSet<Integer> allCustomPloidies;
+
+ /**
+ * The default genotyping engine for the isActive() determination
+ */
+ private MinimalGenotypingEngine defaultActiveRegionEvaluationGenotyperEngine = null;
+
+ /**
+ * Map of user-provided ploidy values to the corresponding active-region genotyper. Values are checked as valid
+ * Integers during initialization and used directly as keys.
+ */
+ private final HashMap<Integer, MinimalGenotypingEngine> ploidyToActiveEvaluationGenotyper = new HashMap<>();
protected ReadThreadingAssembler assemblyEngine = null;
@@ -93,7 +122,16 @@ public class HaplotypeCallerEngine implements AssemblyRegionEvaluator {
// If we are in PDHMM mode we need to hold onto two likelihoods engines for the fallback
private ReadLikelihoodCalculationEngine pdhmmLikelihoodCalculationEngine = null;
- protected HaplotypeCallerGenotypingEngine genotypingEngine = null;
+ /**
+ * The default genotyping engine to use for actual variant calling and genotyping in an active region.
+ */
+ protected HaplotypeCallerGenotypingEngine defaultGenotypingEngine = null;
+
+ /**
+ * Map of user-provided ploidy values to the corresponding genotyper. Values are checked as valid Integers during
+ * initialization and used directly as keys.
+ */
+ protected final HashMap<Integer, HaplotypeCallerGenotypingEngine> ploidyToGenotyperMap = new HashMap<>();
private VariantAnnotatorEngine annotationEngine = null;
@@ -163,7 +201,7 @@ public class HaplotypeCallerEngine implements AssemblyRegionEvaluator {
/**
* Create and initialize a new HaplotypeCallerEngine given a collection of HaplotypeCaller arguments, a reads header,
* and a reference file
- * @param hcArgs command-line arguments for the HaplotypeCaller
+ * @param hcArgs command-line arguments for the HaplotypeCaller
* @param assemblyRegionArgs
* @param createBamOutIndex true to create an index file for the bamout
* @param createBamOutMD5 true to create an md5 file for the bamout
@@ -196,6 +234,21 @@ public HaplotypeCallerEngine(final HaplotypeCallerArgumentCollection hcArgs, Ass
HaplotypeCallerGenotypingDebugger.initialize(hcArgs.genotyperDebugOutStream);
}
+ // Parse the user provided custom ploidy regions into ploidyRegions object containing SimpleCounts
+ if (this.hcArgs.ploidyRegions != null) {
+ FeatureDataSource<NamedFeature> ploidyDataSource = new FeatureDataSource<>(this.hcArgs.ploidyRegions, FeatureDataSource.DEFAULT_QUERY_LOOKAHEAD_BASES, NamedFeature.class);
+ ploidyDataSource.forEach(r -> this.ploidyRegions.add(new SimpleCount(r)));
+ }
+
+ for (SimpleCount region : this.ploidyRegions) {
+ if (!IntervalUtils.intervalIsOnDictionaryContig(region.getInterval(), readsHeader.getSequenceDictionary())) {
+ throw new UserException("Invalid region provided for --ploidy-regions at " + region.getContig() + ":" + region.getStart() + "-" + region.getEnd() + ". Contig name or endpoint doesn't match read sequence dictionary.");
+ }
+ }
+
+ this.ploidyRegionsOverlapDetector = OverlapDetector.create(this.ploidyRegions);
+ this.allCustomPloidies = this.ploidyRegions.stream().map(SimpleCount::getCount).collect(Collectors.toCollection(LinkedHashSet::new));
+
trimmer = new AssemblyRegionTrimmer(assemblyRegionArgs, readsHeader.getSequenceDictionary());
initialize(createBamOutIndex, createBamOutMD5);
}
@@ -242,8 +295,16 @@ private void initialize(boolean createBamOutIndex, final boolean createBamOutMD5
initializeActiveRegionEvaluationGenotyperEngine();
- genotypingEngine = new HaplotypeCallerGenotypingEngine(hcArgs, samplesList, ! hcArgs.doNotRunPhysicalPhasing, hcArgs.applyBQD);
- genotypingEngine.setAnnotationEngine(annotationEngine);
+ defaultGenotypingEngine = new HaplotypeCallerGenotypingEngine(hcArgs, samplesList, ! hcArgs.doNotRunPhysicalPhasing, hcArgs.applyBQD);
+ defaultGenotypingEngine.setAnnotationEngine(annotationEngine);
+
+ // Create other custom genotyping engines if user provided ploidyRegions
+ for (final int ploidy : this.allCustomPloidies) {
+ HaplotypeCallerArgumentCollection newPloidyHcArgs = hcArgs.copyWithNewPloidy(ploidy);
+ HaplotypeCallerGenotypingEngine newGenotypingEngine = new HaplotypeCallerGenotypingEngine(newPloidyHcArgs, samplesList, ! hcArgs.doNotRunPhysicalPhasing, hcArgs.applyBQD);
+ newGenotypingEngine.setAnnotationEngine(annotationEngine);
+ this.ploidyToGenotyperMap.put(ploidy, newGenotypingEngine);
+ }
boolean isFlowBased = (hcArgs.likelihoodArgs.likelihoodEngineImplementation == ReadLikelihoodCalculationEngine.Implementation.FlowBased)
|| (hcArgs.likelihoodArgs.likelihoodEngineImplementation == ReadLikelihoodCalculationEngine.Implementation.FlowBasedHMM);
@@ -381,8 +442,18 @@ private void initializeActiveRegionEvaluationGenotyperEngine() {
// Seems that at least with some test data we can lose genuine haploid variation if we use ploidy == 1
activeRegionArgs.genotypeArgs.samplePloidy = Math.max(MINIMUM_PUTATIVE_PLOIDY_FOR_ACTIVE_REGION_DISCOVERY, hcArgs.standardArgs.genotypeArgs.samplePloidy);
- activeRegionEvaluationGenotyperEngine = new MinimalGenotypingEngine(activeRegionArgs, samplesList);
- activeRegionEvaluationGenotyperEngine.setLogger(logger);
+ defaultActiveRegionEvaluationGenotyperEngine = new MinimalGenotypingEngine(activeRegionArgs, samplesList);
+ defaultActiveRegionEvaluationGenotyperEngine.setLogger(logger);
+
+ // If custom ploidyRegions provided, create corresponding map for active region determination genotyper
+ for (final int ploidy : this.allCustomPloidies) {
+ StandardCallerArgumentCollection newPloidyActiveArgs = new StandardCallerArgumentCollection();
+ newPloidyActiveArgs.copyStandardCallerArgsFrom(activeRegionArgs);
+ newPloidyActiveArgs.genotypeArgs.samplePloidy = Math.max(MINIMUM_PUTATIVE_PLOIDY_FOR_ACTIVE_REGION_DISCOVERY, ploidy);
+ MinimalGenotypingEngine newActiveGenotypingEngine = new MinimalGenotypingEngine(newPloidyActiveArgs, samplesList);
+ newActiveGenotypingEngine.setLogger(logger);
+ this.ploidyToActiveEvaluationGenotyper.put(ploidy, newActiveGenotypingEngine);
+ }
}
/**
@@ -455,7 +526,7 @@ public VCFHeader makeVCFHeader( final SAMSequenceDictionary sequenceDictionary,
final Set<VCFHeaderLine> headerInfo = new HashSet<>();
headerInfo.addAll(defaultToolHeaderLines);
- headerInfo.addAll(genotypingEngine.getAppropriateVCFInfoHeaders());
+ headerInfo.addAll(defaultGenotypingEngine.getAppropriateVCFInfoHeaders());
// all annotation fields from VariantAnnotatorEngine
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions(emitReferenceConfidence()));
// all callers need to add these standard annotation header lines
@@ -523,6 +594,57 @@ public void writeHeader( final VariantContextWriter vcfWriter, final SAMSequence
vcfWriter.writeHeader(makeVCFHeader(sequenceDictionary, defaultToolHeaderLines));
}
+ /**
+ * Determines the appropriate ploidy to use at a given site for different genotyping engines.
+ * @param region Current region of interest
+ * @return Ploidy value to use here given user inputs, or -1 to fall back to the default
+ */
+ private int getPloidyToUseAtThisSite(Locatable region) {
+ Set<SimpleCount> overlaps = this.ploidyRegionsOverlapDetector.getOverlaps(region);
+ // Use the largest ploidy among the intervals overlapping this region
+ if (!overlaps.isEmpty()) {
+ Iterator<SimpleCount> intervals = overlaps.iterator();
+ int max = intervals.next().getCount();
+ while (intervals.hasNext()) {
+ int next = intervals.next().getCount();
+ if (next > max) {
+ max = next;
+ }
+ }
+ return max;
+ } else {
+ return -1; // Sentinel value to fall back to default genotyper
+ }
+ }
+
+ /**
+ * Selects appropriate active region genotyping engine for given region
+ * @param region Current region of interest
+ * @return Active genotyping engine with correct ploidy setting for given region
+ */
+ private MinimalGenotypingEngine getLocalActiveGenotyper(Locatable region) {
+ int currentPloidy = getPloidyToUseAtThisSite(region);
+ if (currentPloidy == -1) {
+ return this.defaultActiveRegionEvaluationGenotyperEngine;
+ } else {
+ return this.ploidyToActiveEvaluationGenotyper.get(currentPloidy);
+ }
+ }
+
+ /**
+ * Selects appropriate genotyping engine for given region.
+ * @param region Current region of interest, e.g. AssemblyRegion
+ * @return Genotyping engine with correct ploidy setting for given region
+ */
+ protected HaplotypeCallerGenotypingEngine getLocalGenotypingEngine(Locatable region) {
+ int currentPloidy = getPloidyToUseAtThisSite(region);
+ if (currentPloidy == -1) {
+ return this.defaultGenotypingEngine;
+ } else {
+ return this.ploidyToGenotyperMap.get(currentPloidy);
+ }
+ }
+
/**
* Given a pileup, returns an ActivityProfileState containing the probability (0.0 to 1.0) that it's an "active" site.
*
@@ -537,6 +659,8 @@ public void writeHeader( final VariantContextWriter vcfWriter, final SAMSequence
*/
@Override
public ActivityProfileState isActive(final AlignmentContext context, final ReferenceContext ref, final FeatureContext features) {
+ MinimalGenotypingEngine localActiveGenotypingEngine = getLocalActiveGenotyper(ref);
+
if (forceCallingAllelesPresent && features.getValues(hcArgs.alleles, ref).stream().anyMatch(vc -> hcArgs.forceCallFiltered || vc.isNotFiltered())) {
return new ActivityProfileState(ref.getInterval(), 1.0);
}
@@ -546,7 +670,7 @@ public ActivityProfileState isActive(final AlignmentContext context, final Refer
return new ActivityProfileState(ref.getInterval(), 0.0);
}
- final int ploidy = activeRegionEvaluationGenotyperEngine.getConfiguration().genotypeArgs.samplePloidy;
+ final int ploidy = localActiveGenotypingEngine.getConfiguration().genotypeArgs.samplePloidy;
final List<Allele> noCall = GATKVariantContextUtils.noCallAlleles(ploidy); // used to noCall all genotypes until the exact model is applied
final Map<String, AlignmentContext> splitContexts;
@@ -565,7 +689,7 @@ public ActivityProfileState isActive(final AlignmentContext context, final Refer
sample.getValue().getBasePileup().forEach(p -> PileupBasedAlleles.addMismatchPercentageToRead(p.getRead(), readsHeader, ref));
}
// The ploidy here is not dictated by the sample but by the simple genotyping-engine used to determine whether regions are active or not.
- final int activeRegionDetectionHackishSamplePloidy = activeRegionEvaluationGenotyperEngine.getConfiguration().genotypeArgs.samplePloidy;
+ final int activeRegionDetectionHackishSamplePloidy = localActiveGenotypingEngine.getConfiguration().genotypeArgs.samplePloidy;
final double[] genotypeLikelihoods = ((RefVsAnyResult) referenceConfidenceModel.calcGenotypeLikelihoodsOfRefVsAny(
activeRegionDetectionHackishSamplePloidy,
sample.getValue().getBasePileup(), ref.getBase(),
@@ -579,9 +703,9 @@ public ActivityProfileState isActive(final AlignmentContext context, final Refer
if (genotypes.size() == 1) {
// Faster implementation using exact marginalization instead of iteration
- isActiveProb = activeRegionEvaluationGenotyperEngine.calculateSingleSampleRefVsAnyActiveStateProfileValue(genotypes.get(0).getLikelihoods().getAsVector());
+ isActiveProb = localActiveGenotypingEngine.calculateSingleSampleRefVsAnyActiveStateProfileValue(genotypes.get(0).getLikelihoods().getAsVector());
} else {
- final VariantContext vcOut = activeRegionEvaluationGenotyperEngine.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getEnd(), alleles).genotypes(genotypes).make());
+ final VariantContext vcOut = localActiveGenotypingEngine.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getEnd(), alleles).genotypes(genotypes).make());
isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb(vcOut.getPhredScaledQual());
}
@@ -607,6 +731,8 @@ public ActivityProfileState isActive(final AlignmentContext context, final Refer
* @return List of variants discovered in the region (may be empty)
*/
public List<VariantContext> callRegion(final AssemblyRegion region, final FeatureContext features, final ReferenceContext referenceContext) {
+ final HaplotypeCallerGenotypingEngine localGenotypingEngine = getLocalGenotypingEngine(region);
+
if ( hcArgs.justDetermineActiveRegions ) {
// we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work
return NO_CALLS;
@@ -799,7 +925,7 @@ public List callRegion(final AssemblyRegion region, final Featur
if (hcArgs.filterAlleles) {
logger.debug("Filtering alleles");
- AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, genotypingEngine);
+ AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, localGenotypingEngine);
//need to update haplotypes to find the alleles
EventMap.buildEventMapsForHaplotypes(readLikelihoods.alleles(),
assemblyResult.getFullReferenceWithPadding(),
@@ -849,7 +975,7 @@ public List callRegion(final AssemblyRegion region, final Featur
}
}
- final CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods(
+ final CalledHaplotypes calledHaplotypes = localGenotypingEngine.assignGenotypeLikelihoods(
haplotypes,
subsettedReadLikelihoodsFinal,
perSampleFilteredReadList,
@@ -890,7 +1016,7 @@ public List callRegion(final AssemblyRegion region, final Featur
result.addAll(referenceConfidenceModel.calculateRefConfidence(assemblyResult.getReferenceHaplotype(),
calledHaplotypes.getCalledHaplotypes(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping,
- subsettedReadLikelihoodsFinal, genotypingEngine.getPloidyModel(), calledHaplotypes.getCalls(), hcArgs.standardArgs.genotypeArgs.supportVariants != null,
+ subsettedReadLikelihoodsFinal, localGenotypingEngine.getPloidyModel(), calledHaplotypes.getCalls(), hcArgs.standardArgs.genotypeArgs.supportVariants != null,
VCpriors));
trimmingResult.nonVariantRightFlankRegion().ifPresent(flank -> result.addAll(referenceModelForNoVariation(flank, false, VCpriors)));
@@ -948,6 +1074,7 @@ protected boolean containsCalls(final CalledHaplotypes calledHaplotypes) {
* @return a list of variant contexts (can be empty) to emit for this ref region
*/
protected List<VariantContext> referenceModelForNoVariation(final AssemblyRegion region, final boolean needsToBeFinalized, final List<VariantContext> VCpriors) {
+ final HaplotypeCallerGenotypingEngine localGenotypingEngine = getLocalGenotypingEngine(region);
if ( emitReferenceConfidence() ) {
if ( needsToBeFinalized ) {
AssemblyBasedCallerUtils.finalizeRegion(region,
@@ -967,7 +1094,7 @@ protected List referenceModelForNoVariation(final AssemblyRegion
final List<Haplotype> haplotypes = Collections.singletonList(refHaplotype);
return referenceConfidenceModel.calculateRefConfidence(refHaplotype, haplotypes,
paddedLoc, region, AssemblyBasedCallerUtils.createDummyStratifiedReadMap(refHaplotype, samplesList, readsHeader, region),
- genotypingEngine.getPloidyModel(), Collections.emptyList(), hcArgs.standardArgs.genotypeArgs.supportVariants != null, VCpriors);
+ localGenotypingEngine.getPloidyModel(), Collections.emptyList(), hcArgs.standardArgs.genotypeArgs.supportVariants != null, VCpriors);
}
else {
return NO_CALLS;
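
Note on the recurring change in HaplotypeCallerEngine.java above: every use of the shared genotypingEngine field is replaced by a localGenotypingEngine obtained from getLocalGenotypingEngine(region) (or a localActiveGenotypingEngine in isActive). The accessor itself is not part of this excerpt, so the following is only a toy sketch of the general pattern, a region-scoped engine lookup; "Engine" and "Region" are stand-ins rather than GATK classes, and the per-contig ploidy override is purely illustrative.

// Toy sketch, not GATK code: callers always ask for an engine scoped to the region
// they are processing instead of reusing one shared mutable field.
import java.util.HashMap;
import java.util.Map;

final class LocalEngineSketch {
    record Engine(int ploidy) { }        // stand-in for HaplotypeCallerGenotypingEngine
    record Region(String contig) { }     // stand-in for AssemblyRegion

    private final Engine defaultEngine = new Engine(2);
    private final Map<String, Engine> regionOverrides = new HashMap<>(Map.of("chrY", new Engine(1)));

    // analogous in spirit to getLocalGenotypingEngine(region)
    Engine getLocalGenotypingEngine(final Region region) {
        return regionOverrides.getOrDefault(region.contig(), defaultEngine);
    }

    public static void main(final String[] args) {
        final LocalEngineSketch sketch = new LocalEngineSketch();
        System.out.println(sketch.getLocalGenotypingEngine(new Region("chr1")).ploidy()); // 2
        System.out.println(sketch.getLocalGenotypingEngine(new Region("chrY")).ploidy()); // 1
    }
}
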
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCallerEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCallerEngine.java
index b5b5c2cfa88..e47245c9f91 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCallerEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/RampedHaplotypeCallerEngine.java
@@ -460,6 +460,7 @@ private void uncollapse(final CallRegionContext context) {
}
private void filter(final CallRegionContext context) {
+ final HaplotypeCallerGenotypingEngine localGenotypingEngine = getLocalGenotypingEngine(context.region);
try {
// no need for this step?
@@ -479,7 +480,7 @@ private void filter(final CallRegionContext context) {
context.suspiciousLocations = new HashSet<>();
if (hcArgs.filterAlleles) {
logger.debug("Filtering alleles");
- AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, genotypingEngine);
+ AlleleFilteringHC alleleFilter = new AlleleFilteringHC(hcArgs, assemblyDebugOutStream, localGenotypingEngine);
//need to update haplotypes to find the alleles
EventMap.buildEventMapsForHaplotypes(context.readLikelihoods.alleles(),
context.assemblyResult.getFullReferenceWithPadding(),
@@ -520,6 +521,7 @@ private void filter(final CallRegionContext context) {
}
private void genotype(final CallRegionContext context) {
+ final HaplotypeCallerGenotypingEngine localGenotypingEngine = getLocalGenotypingEngine(context.region);
// no need for this step?
if ( context.regionVariants != null ) {
@@ -542,7 +544,7 @@ private void genotype(final CallRegionContext context) {
// haplotype containing C as reference (and vice versa). Now this is fine if all possible haplotypes are included
// in the genotyping, but we lose information if we select down to a few haplotypes. [EB]
 List<Haplotype> haplotypes = context.readLikelihoods.alleles();
- final CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods(
+ final CalledHaplotypes calledHaplotypes = localGenotypingEngine.assignGenotypeLikelihoods(
haplotypes,
context.readLikelihoods,
perSampleFilteredReadList,
@@ -582,7 +584,7 @@ private void genotype(final CallRegionContext context) {
result.addAll(referenceConfidenceModel.calculateRefConfidence(context.assemblyResult.getReferenceHaplotype(),
calledHaplotypes.getCalledHaplotypes(), context.assemblyResult.getPaddedReferenceLoc(), regionForGenotyping,
- context.readLikelihoods, genotypingEngine.getPloidyModel(), calledHaplotypes.getCalls(), hcArgs.standardArgs.genotypeArgs.supportVariants != null,
+ context.readLikelihoods, localGenotypingEngine.getPloidyModel(), calledHaplotypes.getCalls(), hcArgs.standardArgs.genotypeArgs.supportVariants != null,
context.VCpriors));
context.nonVariantRightFlankRegion.ifPresent(flank -> result.addAll(referenceModelForNoVariation(flank, false, context.VCpriors)));
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/graphs/AdaptiveChainPruner.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/graphs/AdaptiveChainPruner.java
index 095e2dab8ea..dd02b8e8d4b 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/graphs/AdaptiveChainPruner.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/graphs/AdaptiveChainPruner.java
@@ -2,7 +2,7 @@
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
-import org.apache.commons.lang.mutable.MutableInt;
+import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.tools.walkers.mutect.Mutect2Engine;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FeaturizedReadSets.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java
similarity index 53%
rename from src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FeaturizedReadSets.java
rename to src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java
index c870d9693ee..6a4d3f5c06a 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/FeaturizedReadSets.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java
@@ -1,26 +1,22 @@
-package org.broadinstitute.hellbender.tools.walkers.annotator;
+package org.broadinstitute.hellbender.tools.walkers.mutect;
import htsjdk.variant.variantcontext.Allele;
-import htsjdk.variant.variantcontext.Genotype;
-import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
-import org.apache.commons.lang.StringUtils;
-import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy;
-import org.broadinstitute.hellbender.engine.FeatureContext;
-import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.tools.walkers.annotator.BaseQuality;
+import org.broadinstitute.hellbender.tools.walkers.annotator.ReadPosition;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
-import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.Fragment;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignment;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAlignmentConstants;
-import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import java.util.*;
import java.util.stream.Collectors;
@@ -31,69 +27,23 @@
* [1,2] and [3,4] and allele 2 has featurized reads [5,6] and [7,8], the annotation is
* 1,2,3,4|5,6,7,8
*/
-@DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY,
- summary="Featurized read sets for Mutect3 training data")
-public class FeaturizedReadSets implements JumboGenotypeAnnotation {
- public static final int DEFAULT_BASE_QUALITY = 25;
-
- private static final int DEFAULT_MAX_REF_COUNT = Integer.MAX_VALUE;
+public class FeaturizedReadSets {
+ private static final Logger logger = LogManager.getLogger(FeaturizedReadSets.class);
- public static final int FEATURES_PER_READ = 11;
+ public static final int DEFAULT_BASE_QUALITY = 25;
private static final SmithWatermanAligner aligner = SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA);
- // downsample ref reads to this count if needed
- private final int maxRefCount;
-
- public FeaturizedReadSets(final int maxRefCount) {
- this.maxRefCount = maxRefCount;
- }
-
- public FeaturizedReadSets() {
- this(DEFAULT_MAX_REF_COUNT);
- }
-
- @Override
- public void annotate(final ReferenceContext ref,
- final FeatureContext features,
- final VariantContext vc,
- final Genotype g,
- final GenotypeBuilder gb,
-                         final AlleleLikelihoods<GATKRead, Allele> likelihoods,
-                         final AlleleLikelihoods<Fragment, Allele> fragmentLikelihoods,
-                         final AlleleLikelihoods<Fragment, Haplotype> haplotypeLikelihoods) {
- Utils.nonNull(vc);
- Utils.nonNull(g);
- Utils.nonNull(gb);
-
- if ( likelihoods == null) {
- return;
- }
-
-        final List<List<List<Integer>>> readVectorsByAllele = getReadVectors(vc, Collections.singletonList(g.getSampleName()),
- likelihoods, haplotypeLikelihoods, maxRefCount, Integer.MAX_VALUE);
-
- // flatten twice: over all reads supporting each allele and over all alleles
- // we can partition by allele with the countsInAlleleOrder annotation
- // and by read using the constant feature vector size
- final int[] flattenedTensorInAlleleOrder = readVectorsByAllele.stream()
- .flatMap(listOfLists -> listOfLists.stream().flatMap(List::stream))
- .mapToInt(n -> n)
- .toArray();
-
- final int[] countsInAlleleOrder = readVectorsByAllele.stream().mapToInt(List::size).toArray();
-
- gb.attribute(GATKVCFConstants.FEATURIZED_READ_SETS_KEY, flattenedTensorInAlleleOrder);
- gb.attribute(GATKVCFConstants.FEATURIZED_READ_SETS_COUNTS_KEY, countsInAlleleOrder);
- }
+ private FeaturizedReadSets() { }
    public static List<List<List<Integer>>> getReadVectors(final VariantContext vc,
                                                            final Collection<String> samples,
                                                            final AlleleLikelihoods<GATKRead, Allele> likelihoods,
                                                            final AlleleLikelihoods<Fragment, Haplotype> haplotypeLikelihoods,
final int refDownsample,
- final int altDownsample) {
- return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap());
+ final int altDownsample,
+ final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
+ return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap(), mutect3DatasetMode);
}
// returns Lists (in allele order) of lists of read vectors supporting each allele
@@ -103,7 +53,8 @@ public static List>> getReadVectors(final VariantContext vc,
final AlleleLikelihoods haplotypeLikelihoods,
final int refDownsample,
final int altDownsample,
-                                                            final Map<Allele, Integer> altDownsampleMap) {
+                                                            final Map<Allele, Integer> altDownsampleMap,
+ final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
 final Map<Allele, List<GATKRead>> readsByAllele = likelihoods.alleles().stream()
.collect(Collectors.toMap(a -> a, a -> new ArrayList<>()));
@@ -126,17 +77,14 @@ public static List>> getReadVectors(final VariantContext vc,
.forEach(ba -> ba.evidence.getReads().forEach(read -> bestHaplotypes.put(read, ba.allele)));
return vc.getAlleles().stream()
- .map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes)).collect(Collectors.toList()))
+ .map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes, mutect3DatasetMode)).collect(Collectors.toList()))
.collect(Collectors.toList());
}
- @Override
-    public List<String> getKeyNames() {
- return Arrays.asList(GATKVCFConstants.FEATURIZED_READ_SETS_KEY, GATKVCFConstants.FEATURIZED_READ_SETS_COUNTS_KEY);
- }
-
-    private static List<Integer> featurize(final GATKRead read, final VariantContext vc, final Map<GATKRead, Haplotype> bestHaplotypes) {
+    private static List<Integer> featurize(final GATKRead read, final VariantContext vc,
+                                           final Map<GATKRead, Haplotype> bestHaplotypes,
+ final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
 final List<Integer> result = new ArrayList<>();
result.add(read.getMappingQuality());
result.add(BaseQuality.getBaseQuality(read, vc).orElse(DEFAULT_BASE_QUALITY));
@@ -151,23 +99,41 @@ private static List featurize(final GATKRead read, final VariantContext
result.add(Math.abs(read.getFragmentLength()));
- // distances from ends of fragment
- final int fragmentStart = Math.min(read.getMateStart(), read.getUnclippedStart());
- final int fragmentEnd = fragmentStart + Math.abs(read.getFragmentLength());
- result.add(vc.getStart() - fragmentStart);
- result.add(fragmentEnd - vc.getEnd());
+ if (mutect3DatasetMode == M2ArgumentCollection.Mutect3DatasetMode.ILLUMINA) {
+ // distances from ends of fragment
+ final int fragmentStart = Math.min(read.getMateStart(), read.getUnclippedStart());
+ final int fragmentEnd = fragmentStart + Math.abs(read.getFragmentLength());
+ result.add(vc.getStart() - fragmentStart);
+ result.add(fragmentEnd - vc.getEnd());
+ }
+
+ // Ultima-specific read tags
+ if (mutect3DatasetMode == M2ArgumentCollection.Mutect3DatasetMode.ULTIMA) {
+ result.add(read.getAttributeAsInteger("si")); // si is an integer on the order of 100s or 1000s
+            result.add((int) (1000*read.getAttributeAsFloat("rq")));    // rq is a float between 0 and 1, so we multiply by 1000 and round
+ }
// mismatches versus best haplotype
final Haplotype haplotype = bestHaplotypes.get(read);
- final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
- final GATKRead copy = read.copy();
- copy.setCigar(readToHaplotypeAlignment.getCigar());
- final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
- result.add(mismatchCount);
-
- final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count();
- result.add((int) indelsVsBestHaplotype);
- Utils.validate(result.size() == FEATURES_PER_READ, "Wrong number of features");
+
+ // TODO: fix this
+ // I have no idea why this edge case occurs in Ultima data and maybe sometimes in Illumina
+ if (!bestHaplotypes.containsKey(read)) {
+ logger.warn(String.format("Best haplotypes don't contain read %s at position %s:%d.", read.getName(),
+ vc.getContig(), vc.getStart()));
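+                    // placeholder values standing in for the mismatch and indel counts computed in the else branch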
+ result.add(3);
+ result.add(2);
+ } else {
+ final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
+ final GATKRead copy = read.copy();
+ copy.setCigar(readToHaplotypeAlignment.getCigar());
+ final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
+ result.add(mismatchCount);
+
+ final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count();
+ result.add((int) indelsVsBestHaplotype);
+ }
+ Utils.validate(result.size() == mutect3DatasetMode.getNumReadFeatures(), "Wrong number of features");
return result;
}
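
The featurize changes above make the per-read vector length depend on the dataset mode: ILLUMINA appends two fragment-distance features, ULTIMA appends the si tag and a scaled rq tag, and the final check now validates against Mutect3DatasetMode.getNumReadFeatures() rather than a fixed FEATURES_PER_READ. A self-contained toy illustration of this counting pattern follows; feature names and values are placeholders, not GATK code.

// Toy illustration, not GATK code: mirrors only the counting pattern of the diff above,
// where the dataset mode decides which per-read features are appended and how many are expected.
import java.util.ArrayList;
import java.util.List;

final class FeatureCountSketch {
    enum Mode {
        ILLUMINA(5), ULTIMA(5);   // three shared features plus two mode-specific ones in this toy example
        private final int numReadFeatures;
        Mode(final int numReadFeatures) { this.numReadFeatures = numReadFeatures; }
        int getNumReadFeatures() { return numReadFeatures; }
    }

    static List<Integer> featurize(final Mode mode) {
        final List<Integer> result = new ArrayList<>();
        result.add(60);    // stand-in for mapping quality
        result.add(25);    // stand-in for base quality
        result.add(300);   // stand-in for fragment length
        if (mode == Mode.ILLUMINA) {
            result.add(10);   // stand-in for distance from fragment start
            result.add(12);   // stand-in for distance from fragment end
        } else {              // ULTIMA
            result.add(950);  // stand-in for the "si" tag
            result.add(870);  // stand-in for 1000 * "rq"
        }
        if (result.size() != mode.getNumReadFeatures()) {
            throw new IllegalStateException("Wrong number of features");
        }
        return result;
    }

    public static void main(final String[] args) {
        System.out.println(featurize(Mode.ILLUMINA));
        System.out.println(featurize(Mode.ULTIMA));
    }
}
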
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java
index d25c7a33fc5..0087441ae1a 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java
@@ -81,10 +81,11 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection
public static final String MUTECT3_ALT_DOWNSAMPLE_LONG_NAME = "mutect3-alt-downsample";
public static final String MUTECT3_DATASET_LONG_NAME = "mutect3-dataset";
public static final String MUTECT3_TRAINING_TRUTH_LONG_NAME = "mutect3-training-truth";
+ public static final String MUTECT3_DATASET_MODE_LONG_NAME = "mutect3-dataset-mode";
public static final int DEFAULT_MUTECT3_REF_DOWNSAMPLE = 10;
public static final int DEFAULT_MUTECT3_ALT_DOWNSAMPLE = 20;
- public static final int DEFAULT_MUTECT3_NON_ARTIFACT_RATIO = 20;
+ public static final int DEFAULT_MUTECT3_NON_ARTIFACT_RATIO = 1;
@Override
protected int getDefaultMaxMnpDistance() { return 1; }
@@ -205,6 +206,25 @@ public double getDefaultAlleleFrequency() {
@Argument(fullName = MUTECT3_DATASET_LONG_NAME, optional = true, doc="Destination for Mutect3 data collection")
public File mutect3Dataset;
+ @Advanced
+ @Argument(fullName = MUTECT3_DATASET_MODE_LONG_NAME, optional = true, doc="The type of Mutect3 dataset. Depends on sequencing technology.")
+ public Mutect3DatasetMode mutect3DatasetMode = Mutect3DatasetMode.ILLUMINA;
+
+ public enum Mutect3DatasetMode {
+ ILLUMINA(11),
+ ULTIMA(11);
+
+ final private int numReadFeatures;
+
+ Mutect3DatasetMode(final int numReadFeatures) {
+ this.numReadFeatures = numReadFeatures;
+ }
+
+ public int getNumReadFeatures() {
+ return numReadFeatures;
+ }
+ }
+
/**
* VCF of known calls for a sample used for generating a Mutect3 training dataset. Unfiltered variants (PASS or empty FILTER field)
* contained in this VCF are considered good; other variants (i.e. filtered in this VCF or absent from it) are considered errors.
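
For context, a hedged usage example based only on the argument names defined above; the reference, BAM, and output paths are placeholders, and a real Mutect3 training run may include other options not shown in this diff:

gatk Mutect2 -R reference.fasta -I tumor.bam -O unfiltered.vcf.gz \
    --mutect3-dataset training.dataset \
    --mutect3-dataset-mode ULTIMA
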
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java
index e6a080cd14f..942c2de15b2 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java
@@ -4,18 +4,15 @@
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.lang3.tuple.Triple;
-import org.apache.commons.math3.util.CombinatoricsUtils;
import org.apache.commons.math3.util.FastMath;
-import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger;
import org.broadinstitute.hellbender.tools.walkers.annotator.AssemblyComplexity;
-import org.broadinstitute.hellbender.tools.walkers.annotator.FeaturizedReadSets;
import org.broadinstitute.hellbender.tools.walkers.annotator.ReferenceBases;
import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.Mutect2FilteringEngine;
import org.broadinstitute.hellbender.utils.IndexRange;
import org.broadinstitute.hellbender.utils.MathUtils;
-import org.broadinstitute.hellbender.utils.NaturalLogUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.genotyper.LikelihoodMatrix;
@@ -46,9 +43,6 @@ private enum Label {
ARTIFACT, VARIANT, UNLABELED, IGNORE
}
- // number of features for each vectorized read
- private static final int FEATURES_PER_READ = FeaturizedReadSets.FEATURES_PER_READ;
-
// number of additional variant features for assembly complexity, reference context
private static final int NUM_EXTRA_FEATURES = 9;
@@ -108,7 +102,8 @@ public Mutect3DatasetEngine(final File datasetFile, final boolean trainingMode,
 public void addData(final ReferenceContext ref, final VariantContext vc, Optional<List<VariantContext>> truthVCs,
                        final AlleleLikelihoods<GATKRead, Allele> likelihoods,
                        final AlleleLikelihoods<Fragment, Haplotype> logFragmentLikelihoods,
-                       final AlleleLikelihoods<Fragment, Allele> logFragmentAlleleLikelihoods) {
+                       final AlleleLikelihoods<Fragment, Allele> logFragmentAlleleLikelihoods,
+ final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
final String refBases = ReferenceBases.annotate(ref, vc);
final String refAllele = vc.getReference().getBaseString();
final String contig = vc.getContig();
@@ -132,6 +127,20 @@ public void addData(final ReferenceContext ref, final VariantContext vc, Optiona
final int normalDepth = (int) MathUtils.sum(normalADs);
final boolean hasNormal = normalDepth > 0;
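+        // collect the reference alleles of this variant and of any truth variants so that all alt alleles below can be remapped onto the single longest reference representation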
+        final List<Allele> allRefAlleles = new ArrayList<>();
+ allRefAlleles.add(vc.getReference());
+ truthVCs.ifPresent(vcs -> vcs.forEach(var -> allRefAlleles.add(var.getReference())));
+ final Allele longestRef = allRefAlleles.stream().sorted(Comparator.comparingInt(Allele::length).reversed()).findFirst().get();
+
+ // skip(1) excludes the reference allele
+        final List<Allele> remappedAltAlleles = ReferenceConfidenceVariantContextMerger.remapAlleles(vc, longestRef).stream()
+ .skip(1).toList();
+
+        final Set<Allele> truthAlleles = !truthVCs.isPresent() ? Collections.emptySet() : truthVCs.get().stream()
+ .filter(truthVC -> ! truthVC.isFiltered())
+ .flatMap(truthVC -> ReferenceConfidenceVariantContextMerger.remapAlleles(truthVC, longestRef).stream())
+ .collect(Collectors.toSet());
+
final List