Projects information

Chunk

Conflicting content
* the reads. If, for example, I believe there really is a an A/T polymorphism at a site, then I would like * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would * normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that <<<<<<< HEAD * the AD isn't necessarily calculated exactly for indels. Only reads which are statistically favoring one allele over the other are counted. * Because of this fact, the sum of AD may be much lower than the individual sample depth, especially when there are * many non-informatice reads. ======= * the AD isn't necessarily calculated exactly for indels (it counts as non-reference only those indels that * are unambiguously informative about the alternate allele). Because of this fact and >>>>>>> 293e682e54a6cb8c77f415f0445251121bb3e91d * because the AD includes reads and bases that were filtered by the Unified Genotyper, *one should not base assumptions about the underlying genotype based on it*; instead, the genotype likelihoods (PLs) are what determine the genotype calls.

Conflicting content

 * the reads. If, for example, I believe there really is a an A/T polymorphism at a site, then I would like
 * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would
 * normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that
<<<<<<< HEAD
 * the AD isn't necessarily calculated exactly for indels. Only reads which are statistically favoring one allele over the other are counted.
 * Because of this fact, the sum of AD may be much lower than the individual sample depth, especially when there are
 * many non-informatice reads.
=======
 * the AD isn't necessarily calculated exactly for indels (it counts as non-reference only those indels that
 * are unambiguously informative about the alternate allele). Because of this fact and
>>>>>>> 293e682e54a6cb8c77f415f0445251121bb3e91d
 * because the AD includes reads and bases that were filtered by the Unified Genotyper, one should not base
 * assumptions about the underlying genotype based on it; instead, the genotype likelihoods (PLs) are what
 * determine the genotype calls.

Solution content
* the reads. If, for example, I believe there really is a an A/T polymorphism at a site, then I would like * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would * normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that * the AD isn't necessarily calculated exactly for indels. Only reads which are statistically favoring one allele over the other are counted. * Because of this fact, the sum of AD may be different than the individual sample depth, especially when there are * many non-informatice reads. * Because the AD includes reads and bases that were filtered by the Unified Genotyper and in case of indels is based on a statistical computation, * one should not base assumptions about the underlying genotype based on it; * instead, the genotype likelihoods (PLs) are what determine the genotype calls. */ public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation { public void annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, final ReferenceContext ref, final AlignmentContext stratifiedContext, final VariantContext vc, final Genotype g, final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { if ( g == null \|\| !g.isCalled() ) return; if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty()) annotateWithLikelihoods(alleleLikelihoodMap, ref.getBase(), vc, gb); else if ( vc.isSNP() && stratifiedContext != null) annotateWithPileup(stratifiedContext, vc, gb); } private void annotateWithPileup(final AlignmentContext stratifiedContext, final VariantContext vc, final GenotypeBuilder gb) { HashMap alleleCounts = new HashMap(); for ( Allele allele : vc.getAlleles() ) alleleCounts.put(allele.getBases()[0], 0); ReadBackedPileup pileup = stratifiedContext.getBasePileup(); for ( PileupElement p : pileup ) { if ( alleleCounts.containsKey(p.getBase()) ) alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1); } // we need to add counts in the correct order int[] counts = new int[alleleCounts.size()]; counts[0] = alleleCounts.get(vc.getReference().getBases()[0]); for (int i = 0; i < vc.getAlternateAlleles().size(); i++) counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]); gb.AD(counts); } private void annotateWithLikelihoods(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final byte refBase, final VariantContext vc, final GenotypeBuilder gb) { final HashMap alleleCounts = new HashMap(); for ( final Allele allele : vc.getAlleles() ) { alleleCounts.put(allele, 0); } for (Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); if (a.isNoCall()) continue; // read is non-informative if (!vc.getAlleles().contains(a)) continue; // sanity check - shouldn't be needed alleleCounts.put(a,alleleCounts.get(a)+1); } final int[] counts = new int[alleleCounts.size()]; counts[0] = alleleCounts.get(vc.getReference()); for (int i = 0; i < vc.getAlternateAlleles().size(); i++) counts[i+1] = alleleCounts.get( vc.getAlternateAllele(i) ); gb.AD(counts); } public List getKeyNames() { return Arrays.asList(VCFConstants.GENOTYPE_ALLELE_DEPTHS); } public List getDescriptions() { return Arrays.asList(VCFStandardHeaderLines.getFormatLine(getKeyNames().get(0))); } }

Solution content

 * the reads. If, for example, I believe there really is a an A/T polymorphism at a site, then I would like
 * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would
 * normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that
 * the AD isn't necessarily calculated exactly for indels. Only reads which are statistically favoring one allele over the other are counted.
 * Because of this fact, the sum of AD may be different than the individual sample depth, especially when there are
 * many non-informatice reads.
 * Because the AD includes reads and bases that were filtered by the Unified Genotyper and in case of indels is based on a statistical computation,
 * one should not base assumptions about the underlying genotype based on it;
 * instead, the genotype likelihoods (PLs) are what determine the genotype calls.
 */
public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation {

    public void annotate(final RefMetaDataTracker tracker,
                         final AnnotatorCompatible walker,
                         final ReferenceContext ref,
                         final AlignmentContext stratifiedContext,
                         final VariantContext vc,
                         final Genotype g,
                         final GenotypeBuilder gb,
                         final PerReadAlleleLikelihoodMap alleleLikelihoodMap) {
        if ( g == null || !g.isCalled() )
            return;

        if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty())
            annotateWithLikelihoods(alleleLikelihoodMap, ref.getBase(), vc, gb);
        else if ( vc.isSNP() && stratifiedContext != null)
            annotateWithPileup(stratifiedContext, vc, gb);
    }

    private void annotateWithPileup(final AlignmentContext stratifiedContext, final VariantContext vc, final GenotypeBuilder gb) {

        HashMap alleleCounts = new HashMap();
        for ( Allele allele : vc.getAlleles() )
            alleleCounts.put(allele.getBases()[0], 0);

        ReadBackedPileup pileup = stratifiedContext.getBasePileup();
        for ( PileupElement p : pileup ) {
            if ( alleleCounts.containsKey(p.getBase()) )
                alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1);
        }

        // we need to add counts in the correct order
        int[] counts = new int[alleleCounts.size()];
        counts[0] = alleleCounts.get(vc.getReference().getBases()[0]);
        for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
            counts[i+1] = alleleCounts.get(vc.getAlternateAllele(i).getBases()[0]);

        gb.AD(counts);
    }

    private void annotateWithLikelihoods(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final byte refBase, final VariantContext vc, final GenotypeBuilder gb) {
        final HashMap alleleCounts = new HashMap();

        for ( final Allele allele : vc.getAlleles() ) {
            alleleCounts.put(allele, 0);
        }
        for (Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) {
            final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
            if (a.isNoCall())
                continue; // read is non-informative
            if (!vc.getAlleles().contains(a))
                continue; // sanity check - shouldn't be needed
            alleleCounts.put(a,alleleCounts.get(a)+1);

        }
        final int[] counts = new int[alleleCounts.size()];
        counts[0] = alleleCounts.get(vc.getReference());
        for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
            counts[i+1] = alleleCounts.get( vc.getAlternateAllele(i) );

        gb.AD(counts);
    }

    public List getKeyNames() { return Arrays.asList(VCFConstants.GENOTYPE_ALLELE_DEPTHS); }

    public List getDescriptions() {
        return Arrays.asList(VCFStandardHeaderLines.getFormatLine(getKeyNames().get(0)));
    }
}

File
DepthPerAlleleBySample.java

Developer's decision
Manual

Kind of conflict
Comment

Projects >> gatk >>c98946a469d08e5121a6269c85a37853270ed4b1