########################################################################### # FaIndex.pm --> FASTA-file index package # # Copyright 2010,2012 Philip Johnson. # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published # by the Free Software Foundation, either version 3 of the License, # or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License at for # more details. # ########################################################################### =head1 NAME FaIndex - FASTA-file indexer =head1 SYNOPSIS use lib <~>.'/bin'; use FaIndex; #to automatically create index (& save to $idxFilename): my $idx = FaIndex->new(fa => \@fastaFiles, alias => $aliasFile, idx => $idxFilename); # or if already created: my $idx = FaIndex->new(idx => $idxFilename); #pre-map id for fast access to multiple non-contiguous positions $idx->MapID(acc); #all coordinates are 1-based print $idx->GetPos($pos1), $idx->GetPos($pos2); print $idx->GetSeq($id, $start, $stop); =head1 DESCRIPTION FaIndex creates a index of FASTA sequences stored in the specified files (@fastaFiles). Uses memory mapping to quickly extract subsequences from fasta files. Makes critical assumption that all data lines (aside from the last) in the FASTA file(s) are the same length! =head1 CONSTRUCTOR =head2 FaIndex->new(%) $idx = FaIndex->new(idx => $idxFile, fa => [$file1, $file2], |=~-pure optional arguments below-~=| alias => $aliasFile, gapEndOfSeq => 1); Creates/loads a new index and returns object. =over 4 =item idx Filename for the index (not required if *fa* argument supplied). If file exists, will load. If file doesn't exist, should supply *fa* argument (in this case, index will be both created and saved to disk). =item fa Reference to array of fasta filenames (not required if *idx* argument supplied and index exists on disk). =item alias Optional. Provides alternative id's for sequences (used when building index). =item gapEndOfSeq Optional. Return '-' characters for positions off the end of the sequence rather than throwing an error. =item undefOutOfRange Optional. Return undef if unmapped id or position off the end of the sequence rather than throwing an error. =back =head1 OTHER METHODS =head2 Save($) $idx->Save("somefile.idx"); =head2 Load($) $idx->Load("somefile.idx"); =head2 MapId($) $idx->MapId("chr12"); Mmaps chromosome in prepartion for future calls to GetPos() =head2 GetIds() my @ids = $idx->GetIds(); Returns list of fasta id strings currently loaded in index. =head2 GetLength($) my $faLen = $idx->GetLength("chr12"); Returns length of specified fasta sequence. =head2 GetPos($) $idx->GetPos(10); Returns specified position in *previously mapped* id. First position in sequence is numbered 1. =head2 GetSeq($$$) $idx->GetSeq("chr12", $start, $stop); Returns specified subsequence of id. First position in sequence is numbered 1. =head1 AUTHOR - Philip Johnson =cut package FaIndex; use strict; use Carp; use File::Spec; use File::Map; use IO::File; our($VERSION) = 1; #----------------------------------------------------------------------------- # PRE : args: fa => fasta files, alias => aliasFile, idx => indexFile # POST: FaIndex object constructed sub new($%) { my ($class, %args) = @_; my $this = {}; $this->{m_LoadedAcc} = undef; $this->{m_Filename} = ''; $this->{m_Idx} = {}; $this->{m_faLen} = 0; $this->{m_Opt_GapEndOfSeq} = defined $args{gapEndOfSeq} && $args{gapEndOfSeq}; $this->{m_Opt_UndefOutOfRange} = defined $args{undefOutOfRange} && $args{undefOutOfRange}; if (defined $args{idx}) { my $upToDate = 1; if (-e $args{idx} && defined $args{fa}) { for (my $i = 0; $i < @{$args{fa}} && $upToDate; ++$i) { $upToDate = 0 if (-M $args{idx} > -M $args{fa}[$i]); } warn("Old index.. attempting to rebuild.\n") if !$upToDate; } if ($upToDate && Load($this, $args{idx})) { return bless $this, $class; } if (!defined $args{fa} || @{$args{fa}} == 0) { die "Could not load index ($args{idx}) and did not supply fasta files.\n"; } } if (exists($args{fa})) { CreateIndex($this->{m_Idx}, $args{fa}); AddIdxAliases($this->{m_Idx}, $args{alias}); if (defined($args{idx})) { Save($this, $args{idx}); } } return bless $this, $class; } #----------------------------------------------------------------------------- # PRE : none # POST: FaIndex object destroyed sub DESTROY { my $this = shift; File::Map::unmap(${$this->{m_Mmap}}) if (defined $this->{m_Mmap}); } #------------------------------------------------------------------------------ # PRE : index, reference to array of fasta files # POST: fasta files added to index sub CreateIndex { my ($idx, $fastaFiles) = @_; my $id; my ($faLen, $lineLen, $eol) = (0,0,0); foreach my $fn (@$fastaFiles) { croak "Could not find file ($fn)\n" if (!-e $fn); warn("Indexing $fn..\n"); $fn = File::Spec->rel2abs($fn); my $FAFILE = IO::File->new($fn) or croak $!; while (defined(my $line = <$FAFILE>)) { if ($line =~ /^>/) { if (defined($id)) { #update previous w/ fasta length $idx->{$id}[4] = $faLen; } ($id) = $line =~ /^>(.+?)\s/; my $offset = tell($FAFILE); $line = <$FAFILE>; chomp $line; $eol = tell($FAFILE) - $offset - length($line); $lineLen = length($line); $faLen = length($line); #entry below is just a place holder $idx->{$id} = [$fn, $offset, $eol, $lineLen, $faLen]; foreach my $altId (split /\|/, $id) { if ($altId ne 'gi' && $altId ne 'ref' && !exists($idx->{$altId})) { $idx->{$altId} = $idx->{$id}; } } } else { $faLen += length($line) - $eol; } } #NOTE: this length might be off by $eol if no EOL at EOF if (defined($id)) { #update previous w/ fasta length $idx->{$id}[4] = $faLen; } } } #------------------------------------------------------------------------------ # PRE : alias file, index to which to add # POST: all appropriate aliases added sub AddIdxAliases { my ($idx, $aliasFile) = @_; return if !defined $aliasFile || !-e $aliasFile; my $ALIASFILE; if ($aliasFile =~ 'chr_NC_gi$') { #special case it $ALIASFILE = IO::File->new(q^perl -nae 'print qq*$F[2]\t$F[0]|chr$F[0]\n*' ^.$aliasFile.'|'); } else { $ALIASFILE = IO::File->new($aliasFile); } while (defined(my $line = <$ALIASFILE>)) { chomp $line; my ($origId, $aliasId) = split /\t/, $line; if (exists($idx->{$origId})) { foreach my $id (split /\|/, $aliasId) { $idx->{$id} = $idx->{$origId}; } } } } #----------------------------------------------------------------------------- # PRE : FaIndex created; filename to save index to. # POST: index written to disk sub Save($$) { my ($this, $idxFile) = @_; my $INDEX = IO::File->new(">$idxFile") or croak $!; print $INDEX "#FaIndex.ver:$VERSION\n"; foreach my $id (sort(keys(%{$this->{m_Idx}}))) { print $INDEX join("\t", $id, @{$this->{m_Idx}{$id}}), "\n"; } } #----------------------------------------------------------------------------- # PRE : FaIndex created, filename to load index from # POST: index loaded (previous entries, if any, NOT explicitly deleted) sub Load($$) { my ($this, $idxFile) = @_; my $INDEX = IO::File->new($idxFile) or return 0; while (defined(my $line = <$INDEX>)) { chomp $line; if ($INDEX->input_line_number() == 1) { #verify using up to date version of index! my ($idxVer) = $line =~ /ver:(\d)+/; if (!defined $idxVer || $idxVer < $VERSION) { warn("Old index.. attempting to rebuild.\n"); return 0; } } else { my @F = split /\t/, $line; $this->{m_Idx}{$F[0]} = [@F[1..$#F]]; } } return 1; } #----------------------------------------------------------------------------- # PRE : FaIndex created; id to load # POST: file mmapped; 1 if successful, 0 otherwise sub MapId($$) { my ($this, $id) = @_; if (!defined $this->{m_LoadedAcc} || $id ne $this->{m_LoadedAcc}) { if (!exists($this->{m_Idx}{$id})) { return 0 if ($_[0]->{m_Opt_UndefOutOfRange}); croak "Fasta not found for '$id'.\n"; } $this->{m_LoadedAcc} = $id; my $prevFile = $this->{m_Filename}; ($this->{m_Filename}, $this->{m_Offset}, $this->{m_eolLen}, $this->{m_LineLen}, $this->{m_faLen}) = @{$this->{m_Idx}{$id}}; if ($prevFile ne $this->{m_Filename}) { File::Map::unmap(${$this->{m_Mmap}}) if defined $this->{m_Mmap}; my $tmp; File::Map::map_file($tmp, $this->{m_Filename}); $this->{m_Mmap} = \$tmp; } } return 1; } #----------------------------------------------------------------------------- # PRE : FaIndex created # POST: array of fasta ids listed in current index sub GetIds($) { return keys(%{$_[0]->{m_Idx}}) } #----------------------------------------------------------------------------- # PRE : FaIndex created; id # POST: length of fasta for this id sub GetLength($$) { return undef if !$_[0]->MapId($_[1]); return $_[0]->{m_faLen}; } #----------------------------------------------------------------------------- # PRE : FaIndex created; id loaded; single position in 1-based coords # to load as fast as possible # POST: bases sub GetPos($$) { if ($_[1] > $_[0]->{m_faLen}) { return '-' if ($_[0]->{m_Opt_GapEndOfSeq}); return undef if ($_[0]->{m_Opt_UndefOutOfRange}); croak "Bad pos -- $_[1] exceeds sequence length ($_[0]->{m_faLen})\n"; } return substr(${$_[0]->{m_Mmap}}, $_[0]->{m_Offset} + $_[1]-1 + $_[0]->{m_eolLen}*(int(($_[1]-1)/$_[0]->{m_LineLen})), 1); } #----------------------------------------------------------------------------- # PRE : FaIndex created; id,start,stop # POST: bases sub GetSeq($$$$) { # print $_[2], "\t", $_[3], "\n"; return undef if !$_[0]->MapId($_[1]); my $stop = $_[3]; if ($stop == -1) { #flag for end of sequence $stop = $_[0]->{m_faLen}; } my $extraGaps = ''; if ($stop > $_[0]->{m_faLen} || $_[2] > $_[0]->{m_faLen}) { if ($_[0]->{m_Opt_GapEndOfSeq}) { if ($_[2] > $_[0]->{m_faLen}) { #all gaps my $seq = '-' x ($stop-$_[2]+1); return \$seq; } $extraGaps = '-' x ($stop - $_[0]->{m_faLen}); $stop = $_[0]->{m_faLen}; } else { return undef if ($_[0]->{m_Opt_UndefOutOfRange}); croak "Bad range ($_[2], $_[3]) -- exceeds sequence length ". "($_[0]->{m_faLen})\n"; } } if ($_[2] > $stop) { croak "Bad range ($_[2], $_[3]) -- start is greater than stop\n"; } if ($_[2] < 1 || $stop < 1) { croak "Bad range -- neither start ($_[2]) nor stop ($stop) can be <1\n"; } my $seq = substr(${$_[0]->{m_Mmap}}, $_[0]->{m_Offset} + $_[2]-1 + $_[0]->{m_eolLen}*(int(($_[2]-1)/$_[0]->{m_LineLen})), ($stop-$_[2]+1) + ($_[0]->{m_eolLen}*(int(($stop-1)/$_[0]->{m_LineLen}))- $_[0]->{m_eolLen}*(int(($_[2]-1)/$_[0]->{m_LineLen}))) ); $seq =~ s/\n//g; if (length($extraGaps) > 0) { $seq .= $extraGaps; } return \$seq; } #perl wants modules to return "TRUE" 1;