#!/usr/bin/perl -w ########################################################################### # # Lift over single coordinates from one genome to another -- but do it # waay faster than UCSC's liftOver program # # Copyright 2010 Philip Johnson. # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published # by the Free Software Foundation, either version 3 of the License, # or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License at for # more details. # # Two requirements: # 1) query coordinates must be sorted by chromosome and position. # 2) chain file must be split into non-overlapping blocks and # sorted according to the same key as the query coords # # 24 Aug 2010: ug..added stranding to output (assume input is +) # # Input data: # [other fields optional] # Output data (remapped): # [other fields optional] # ########################################################################### use strict; use Getopt::Long; use File::Temp; my ($posFile, $chainFile, $reSort, $contigBases); $contigBases = 1; if (!GetOptions('pos=s' => \$posFile, 'chain=s' => \$chainFile, 'reSort' => \$reSort, 'contiguousBases=i' => \$contigBases) || !defined($posFile && $chainFile)) { print "usage: $0 -pos -chain [-contig ] [-reSort]\n". "Where\n". "\t-pos <> file w/ coordinates to be lifted\n". "\t-chain <> chain file must be sorted!\n". "\t-contig <> specifies a minimum number of contigous mapped bases\n". "\t centered around the coordinate to be lifted\n". "\t-reSort sorts the positions to be lifted, then resorts the output\n". "\t back to the original order (i.e. potentially slow)\n". "\n". "Input data (must be sorted or specify -reSort; coordinates are 1-based): [other fields optional] Output data (mapped on new genome): [other fields propagated through] -or- Unmappable: \n"; exit 1; } $contigBases = int($contigBases / 2); #convert to +/- window my $OUTFILE = \*STDOUT; if (defined $reSort) { SortPosfile($posFile); $OUTFILE = new File::Temp; } my ($tName, $tSize, $tStrand, $tStart, $tEnd, $qName, $qSize, $qStrand, $qStart, $qEnd); my ($size, $dt, $dq); open POS, $posFile or die $!; open CHAIN, $chainFile or die $!; my ($prevAcc, $prevPos) = ('', 0); $tName = ''; $tStart = 0; while (defined($_ = )) { chomp; my ($fromAcc, $fromPos, $remainder) = split /\s+/, $_, 3; --$fromPos; #convert to zero-based if ($fromAcc lt $prevAcc || #protect against forgetfulness ($fromAcc eq $prevAcc && $fromPos < $prevPos)) { die "Input file not sorted! fastLift requires sorted input.\n"; } $prevPos = $fromPos; $prevAcc = $fromAcc; while (!eof(CHAIN) && ($tName lt $fromAcc || ($tName eq $fromAcc && $tEnd <= $fromPos)) && LoadNextBlock()) { #print "$tName, $fromAcc; $fromPos, $tStart, $tEnd\n"; } if ($tName ne $fromAcc || $fromPos < $tStart+$contigBases || $fromPos > $tEnd-$contigBases) { print "Unmappable:\t$fromAcc\t", $fromPos+1, "\n"; next; } my $toAcc = $qName; my $toPos = ($qStrand eq '+') ? $qStart + ($fromPos - $tStart): $qSize - ($qStart + ($fromPos - $tStart)) - 1; ++$toPos; #convert to one-based $remainder = (defined $remainder) ? "\t".$remainder : ''; print $OUTFILE join("\t", $toAcc, $toPos, $qStrand), $remainder, "\n"; } close CHAIN; close POS; if (defined $reSort) { ResortOutfile($OUTFILE); } sub LoadNextBlock { my $l = ; while (defined $l && $l =~ /^$/) { #skip blank lines separating chains $l = ; } return 0 if !defined $l; if ($l =~ /^c/) { (undef, undef, $tName, $tSize, $tStrand, $tStart, $tEnd, $qName, $qSize, $qStrand, $qStart, $qEnd) = split /\s+/, $l; $l = ; } else { $tStart += $size + $dt; $qStart += $size + $dq; } ($size, $dt, $dq) = split /\s+/, $l; $tEnd = $tStart + $size; $qEnd = $qStart + $size; return 1; } #------------------------------------------------------------------------------ sub SortPosfile { my ($origFile) = @_; my $tmpfile = new File::Temp; system(q*perl -ne 'chomp; print "$_\t$.\n"' *." $origFile | sort -k1,1 -k2n,2 > $tmpfile") == 0 or die; $_[0] = $tmpfile; } sub ResortOutfile { my ($outFile) = @_; my $cmd = q*perl -nae 'print scalar(@F)' *.$outFile; my $field = `$cmd`; system("sort -k${field}n,$field $outFile | ".q*perl -nae 'print join("\t", @F[0..($#F-1)]), "\n"'*) == 0 or die; }