00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #ifndef CGA_TOOLS_JUNCTION_COMPARE_HPP_
00016 #define CGA_TOOLS_JUNCTION_COMPARE_HPP_
00017
00029
00030
00031 #include "cgatools/core.hpp"
00032 #include "cgatools/junctions/Junction.hpp"
00033 #include "cgatools/util/GenericHistogram.hpp"
00034
00035
00036 #include <string>
00037 #include <vector>
00038 #include <map>
00039
00040
00041 namespace cgatools { namespace junctions {
00042
00043 class JunctionComparatorWithinContig
00044 {
00045 public:
00046 static const char SEPARATOR = '\t';
00047
00048 typedef std::vector<size_t> JunctionIndices;
00049 typedef std::pair<int,int> JunctionGroupId;
00050 typedef std::map< JunctionGroupId, JunctionIndices > JunctionGroups;
00051
00052 JunctionComparatorWithinContig(
00053 const ReferenceGenome& reference,
00054 JunctionFile& file1,
00055 JunctionFile& file2);
00056
00057 void compareWithinContig(
00058 const JunctionFile& file1,
00059 const JunctionFile& file2,
00060 const JunctionIndices& indexes1,
00061 const JunctionIndices& indexes2,
00062 JunctionFile& result1and2,
00063 JunctionFile& result1not2,
00064 std::ostream &report
00065 ) const;
00066
00067 void compare();
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077 const JunctionFile &getResult1and2() const { return result1and2_; }
00078 const JunctionFile &getResult1not2() const { return result1not2_; }
00079 const JunctionFile &getResult2and1() const { return result2and1_; }
00080 const JunctionFile &getResult2not1() const { return result2not1_; }
00081
00082 protected:
00083
00084 void collectJunctionGroups(JunctionFile &junctFile, JunctionGroups &result);
00085
00086 size_t distanceTolerance_;
00087
00088 const ReferenceGenome& reference_;
00089
00090 JunctionFile& file1_;
00091 JunctionFile& file2_;
00092
00093 JunctionGroups junctionGroups1_;
00094 JunctionGroups junctionGroups2_;
00095
00096 JunctionFile result1and2_;
00097 JunctionFile result1not2_;
00098 JunctionFile result2and1_;
00099 JunctionFile result2not1_;
00100 };
00101
00102 #define ADD_FNAME(names, N) names[N] = #N
00103
00104 class JunctionCounterFields {
00105 public:
00106 enum Fields {
00107 TOTAL_JUNCTIONS = 0,
00108 LEFT_SIDES_EQUAL,
00109 RIGHT_SIDES_EQUAL,
00110 BOTH_SIDES_EQUAL,
00111 LEFT_UNKNOWN,
00112 RIGHT_UNKNOWN,
00113 STRAND_BOTH_POSITIVE,
00114 STRAND_LEFT_NEGATIVE,
00115 STRAND_RIGHT_NEGATIVE,
00116 STRAND_BOTH_NEGATIVE,
00117
00118 NUMBER_OF_FIELDS
00119 };
00120 typedef boost::array<std::string,NUMBER_OF_FIELDS> Names;
00121
00122 static Names getFieldNames()
00123 {
00124 Names names;
00125 ADD_FNAME(names, TOTAL_JUNCTIONS);
00126 ADD_FNAME(names, LEFT_SIDES_EQUAL);
00127 ADD_FNAME(names, RIGHT_SIDES_EQUAL);
00128 ADD_FNAME(names, BOTH_SIDES_EQUAL);
00129 ADD_FNAME(names, LEFT_UNKNOWN);
00130 ADD_FNAME(names, RIGHT_UNKNOWN);
00131 ADD_FNAME(names, STRAND_BOTH_POSITIVE);
00132 ADD_FNAME(names, STRAND_LEFT_NEGATIVE);
00133 ADD_FNAME(names, STRAND_RIGHT_NEGATIVE);
00134 ADD_FNAME(names, STRAND_BOTH_NEGATIVE);
00135 return names;
00136 }
00137 };
00138
00139
00140 template <class Fields, class Type>
00141 class CounterSet
00142 {
00143 public:
00144 CounterSet() {
00145 values_.assign(Type(0));
00146 }
00147
00148 void print(std::ostream &out) const {
00149 typename Fields::Names fieldNames = Fields::getFieldNames();
00150 for (size_t i=0; i<values_.size(); ++i)
00151 out << fieldNames[i] << '\t' << values_[i] << std::endl;
00152 }
00153
00154 Type &operator[] (size_t i) {CGA_ASSERT_L(i,values_.size()); return values_[i];}
00155 boost::array<Type,Fields::NUMBER_OF_FIELDS> values_;
00156 };
00157
00158 class JunctionIt
00159 {
00160 public:
00161 typedef Junctions::const_iterator It;
00162 JunctionIt() :it_(nullValue_.begin()) {}
00163 JunctionIt(It it) :it_(it) {}
00164
00165 const Junction& operator* () const {return *it_;}
00166 bool operator< (const JunctionIt& other) {
00167 CGA_ERROR_EX("JunctionIt::operator< is disabled"); return it_<other.it_;
00168 }
00169
00170 It it_;
00171 std::vector<JunctionIt> matchedJunctionIts_;
00172
00173 bool isNull() const {return it_->id_==nullValue_.front().id_ && matchedJunctionIts_.empty();}
00174 public:
00175 static Junctions nullValue_;
00176
00177 friend std::ostream& operator<<(std::ostream &ostr, const JunctionIt::It& j);
00178 friend std::ostream& operator<<(std::ostream &ostr, const JunctionIt& j);
00179 };
00180
00181
00182
00183 class CompareBySide {
00184 public:
00185 CompareBySide(int firstSide=JUNCTION_LEFT_SIDE)
00186 :firstSide_(firstSide),
00187 secondSide_(JUNCTION_RIGHT_SIDE-firstSide)
00188 {}
00189 static Location getLocation(const Junction& j, int side) {
00190 Location l = j.sideSections_[side].position_;
00191
00192
00193 return l;
00194 }
00195 bool operator() (const Junction& j1,const Junction& j2) const {
00196
00197
00198
00199 Location l1 = getLocation(j1, firstSide_);
00200 Location l2 = getLocation(j2, firstSide_);
00201 if (l1!=l2)
00202 return l1<l2;
00203 return getLocation(j1, secondSide_) < getLocation(j2, secondSide_);
00204 }
00205 bool operator() (JunctionIt::It j1,JunctionIt::It j2) const {
00206 return operator() (*j1,*j2);
00207 }
00208 bool operator() (const JunctionIt &j1,const JunctionIt &j2) const {
00209 return operator() (*j1,*j2);
00210 }
00211 protected:
00212 int firstSide_;
00213 int secondSide_;
00214 };
00215
00216 class JuctionStatisticsCollector
00217 {
00218 public:
00219
00220 JuctionStatisticsCollector(uint32_t distanceTolerance)
00221 :distanceTolerance_(distanceTolerance),out_(NULL)
00222 {
00223 size_t buckets[7][3] =
00224 {
00225 {0,101,10},
00226 {200, 1001, 100},
00227 {2000,10001,1000},
00228 {20000, 100001, 10000},
00229 {2000000, 10000001, 1000000},
00230 {20000000,100000001, 10000000},
00231 {200000000,1000000001,100000000}
00232 };
00233
00234 scoreHistogram_.addRange(0,20,1);
00235 scoreHistogram_.addRange(20,200,10);
00236
00237 for (size_t i=0; i<6; ++i)
00238 oneSideNeighbourDist_.addRange(buckets[i][0],buckets[i][1],buckets[i][2]);
00239
00240 searchDistanceTolerance_ = distanceTolerance*5;
00241 twoSideNeighbourDist_.addRange(0,searchDistanceTolerance_+10,10);
00242
00243 for (size_t i=0; i<4; ++i)
00244 clusterSizeInBases_.addRange(buckets[i][0],buckets[i][1],buckets[i][2]);
00245
00246 clusterSizeInJunctions_.addRange(0,15,1);
00247 }
00248
00249 void findOneSideNeighbours(const Junctions& junctions, int firstSide);
00250 void findFullNeighbours(const Junctions& junctions);
00251 void run(const Junctions& junctions, std::ostream &out);
00252 protected:
00253 uint32_t distanceTolerance_;
00254 uint32_t searchDistanceTolerance_;
00255 CounterSet<JunctionCounterFields,int64_t> counters_;
00256 util::GenericHistogram<size_t,size_t> scoreHistogram_;
00257 util::GenericHistogram<size_t,size_t> oneSideNeighbourDist_;
00258 util::GenericHistogram<size_t,size_t> twoSideNeighbourDist_;
00259
00260 util::GenericHistogram<size_t,size_t> clusterSizeInBases_;
00261 util::GenericHistogram<size_t,size_t> clusterSizeInJunctions_;
00262 std::ostream *out_;
00263 };
00264
00265
00266 class JunctionIts : public std::vector<JunctionIt>
00267 {
00268 public:
00269 JunctionIts() : std::vector<JunctionIt>() {}
00270
00271 JunctionIts(const Junctions& junctions, const std::string &id)
00272 : std::vector<JunctionIt>()
00273 {
00274 addJunctions(junctions, id);
00275 }
00276
00277 void addJunctions(const Junctions& junctions, const std::string &id) {
00278 reserve(size()+junctions.size());
00279 for (Junctions::const_iterator it=junctions.begin(),endIt=junctions.end(); it!=endIt;++it) {
00280 push_back(it);
00281 }
00282 }
00283 };
00284
00285 class FullGenomeJunctionComparator
00286 {
00287 public:
00288 typedef std::vector<Junctions> JunctionsArray;
00289 typedef std::vector<JunctionIts> JunctionsItsArray;
00290
00291
00292 FullGenomeJunctionComparator(
00293 uint32_t distanceTolerance,
00294 const std::vector<size_t> &scoreThresholds,
00295 std::ostream &outCommonJunctionsStream
00296 )
00297 : distanceTolerance_(distanceTolerance),
00298 scoreThresholds_(scoreThresholds),
00299 outCommonJunctionsStream_(outCommonJunctionsStream)
00300 {}
00301
00302 static const char SEPARATOR = '\t';
00303
00304
00305
00306
00307
00308 double getCompatibility(const Junction& j0, const Junction& j1) const {
00309 double result = 0;
00310 uint64_t totDistance = 0;
00311 if (size_t(j0.score_.score_+1E-5) >= scoreThresholds_[0] &&
00312 size_t(j1.score_.score_+1E-5) > scoreThresholds_[1])
00313 {
00314 for (size_t i=size_t(JUNCTION_LEFT_SIDE); i<=size_t(JUNCTION_RIGHT_SIDE); ++i) {
00315 uint32_t dist = std::abs(CompareBySide::getLocation(j0,i).distanceTo(
00316 CompareBySide::getLocation(j1,i)));
00317 if (dist < distanceTolerance_
00318 && j0.sideSections_[i].strand_ == j1.sideSections_[i].strand_)
00319 ++result;
00320 totDistance+=dist*dist;
00321 }
00322 result+= 10/(sqrt((double)totDistance)+20);
00323
00324 }
00325 return result;
00326 }
00327
00328 bool isSideCompatible(const Junction& j0, const Junction& j1, int side) const {
00329 return (uint32_t)std::abs(CompareBySide::getLocation(j0,side).distanceTo(
00330 CompareBySide::getLocation(j1,side))) < distanceTolerance_
00331 && j0.sideSections_[side].strand_ == j1.sideSections_[side].strand_
00332 && size_t(j0.score_.score_+1E-5) >= scoreThresholds_[0]
00333 && size_t(j1.score_.score_+1E-5) > scoreThresholds_[1];
00334 }
00335
00336 void compare(const JunctionsArray& junctions) const;
00337
00338 void compareBySide(const JunctionsArray& junctions, int firstSide,
00339 JunctionIts &oneSideCompatible,
00340 JunctionIts &twoSidesCompatible,
00341 JunctionIts &twoSidesIncompatible) const;
00342 void mergeResults(JunctionsItsArray &input, JunctionIts &output) const;
00343 void printResults(const JunctionIts& junctionIts, std::ostream & out) const;
00344 protected:
00345 uint32_t distanceTolerance_;
00346 std::vector<size_t> scoreThresholds_;
00347 std::ostream &outCommonJunctionsStream_;
00348 };
00349
00350 }}
00351
00352
00353 #endif //CGA_TOOLS_JUNCTION_COMPARE_HPP_
00354