Subject: | Need to process CIGAR N in dna function |
Date: | Fri, 4 Mar 2011 15:28:53 -0800 |
To: | "bug-Bio-SamTools [...] rt.cpan.org" <bug-Bio-SamTools [...] rt.cpan.org> |
From: | Yi-Shiou Chen <Yi-Shiou.Chen [...] accelrys.com> |
According to samtools's calmd function, the MD tag of BAM doesn't carry base information for skipping segments (i.e. N in CIGAR). For example, a read with 40M100N35N in CIGAR can has 75 in MD, without any information about the 100N. The dna function of Bio::DB::Bam::AlignWrapper doesn't consider this case and returns wrong reference sequences for reads with N in CIGAR. Propose to fix the function as follows.
Thanks,
Yi-Shiou Chen
Accelrys
sub dna {
my $self = shift;
my $sam = $self->{sam};
if (my $md = $self->get_tag_values('MD')) { # try to use MD string
my $qseq = $self->qseq;
#preprocess qseq using cigar array
my $cigar = $self->cigar_array;
my $seq = '';
my @Npos = ();
my @Ncount = ();
for my $op (@$cigar) {
my ($operation,$count) = @$op;
if ($operation eq 'M') {
$seq .= substr($qseq,0,$count,''); # include these residues
} elsif ($operation eq 'N') {
# record "N" positions
push @Npos, length($seq); # 0-based
push @Ncount, $count;
my $ns = $self->start + length($seq);
# insert bases in "N" segment
my $nseq;
if(defined $self->{sam}) {
$nseq = $self->{sam}->seq($self->seq_id, $ns, $ns + $count - 1);
} else{
$nseq = 'N' x $count;
}
$seq .= $nseq;
} elsif ($operation eq 'S' or $operation eq 'I') {
substr($qseq,0,$count,''); # skip soft clipped and inserted residues
}
}
my $start = 0;
my $result;
while ($md =~ /(\d+)|\^([gatcn]+)|([gatcn]+)/ig) {
if (defined $1) {
my $c = $1;
while($c > 0) {
if(@Npos == 0) {
$result .= substr($seq,$start,$c);
$start += $c;
last;
}
# may cross N segments, need to copy piece by piece
if(($start + $c) > $Npos[0]) {
my $toAdd = $Npos[0] - $start;
$result .= substr($seq, $start, $toAdd).substr($seq, $Npos[0], $Ncount[0]);
$start += $toAdd + $Ncount[0];
$c -= $toAdd;
shift @Npos;
shift @Ncount;
} else {
$result .= substr($seq, $start, $c);
$start += $c;
last;
}
}
} elsif ($2) {
$result .= $2;
} elsif ($3) {
$result .= $3;
$start += length $3;
# insert N segments
while(@Npos > 0 and $start >= $Npos[0]) {
my $offset = $start - $Npos[0];
$result = substr($result, 0, length($result) - $offset).substr($seq, $Npos[0], $Ncount[0]).substr($result, length($result) - $offset);
$start += $Ncount[0];
shift @Npos;
shift @Ncount;
}
}
}
return $result;
}
else {
return $self->{sam}->seq($self->seq_id,$self->start,$self->end);
}
}
Message body is not shown because it is too large.