00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #ifndef CGATOOLS_VARIANTS_VARIANTFILEVCFSOURCE_HPP_
00016 #define CGATOOLS_VARIANTS_VARIANTFILEVCFSOURCE_HPP_ 1
00017
00019
00020 #include "cgatools/core.hpp"
00021 #include "cgatools/conv/VcfRecordSource.hpp"
00022 #include "cgatools/reference/CrrFile.hpp"
00023 #include "cgatools/variants/SuperlocusIterator.hpp"
00024
00025 #include <deque>
00026 #include <map>
00027 #include <set>
00028 #include <vector>
00029
00030
00031 namespace cgatools { namespace cgdata {
00032 class GenomeMetadata;
00033 } }
00034
00035 namespace cgatools { namespace variants {
00036
00037
00038 class SubFieldAnnotation;
00039 struct SubFieldCall;
00040 namespace calib {
00041 class CalibratedScorer;
00042 }
00043
00044 class VariantFileVcfRecordWriter : public cgatools::conv::VcfRecordWriter
00045 {
00046 public:
00047 VariantFileVcfRecordWriter(
00048 const std::vector< boost::shared_ptr<VariantFileIterator> >& var,
00049 const std::vector< std::string>& fieldNames,
00050 const cgatools::reference::CrrFile& crr,
00051 const std::string& calibPrefix);
00052
00053
00054 cgatools::reference::Location getLocation() const;
00055 void writeRef(std::ostream& out) const;
00056 void writeAlt(std::ostream& out) const;
00057 void writeInfo(std::ostream& out) const;
00058 void writeFormat(std::ostream& out) const;
00059 void writeSample(std::ostream& out, size_t gIdx) const;
00060
00061
00062
00063 struct VcfData
00064 {
00065 reference::Range range_;
00066
00067
00068
00069 std::vector<std::string> alleles_;
00070 std::vector< std::vector<size_t> > alleleIdx_;
00071 std::vector<uint32_t> ps_;
00072 std::vector< std::vector<int32_t> > hq_;
00073 std::vector< std::vector<int32_t> > ehq_;
00074 std::vector< std::vector<int32_t> > cehq_;
00075 };
00076
00077 const VcfData& getRecord() const;
00078
00079 const cgatools::reference::CrrFile& getCrr() const
00080 {
00081 return crr_;
00082 }
00083
00084 void setLocus(
00085 const cgatools::reference::Range& range,
00086 const Superlocus& sl,
00087 std::vector<PhasedHypothesis>& hyp);
00088
00089 size_t getSampleCount() const;
00090
00091 const char* getSomaticStatusAnnotation(
00092 size_t gIdx,
00093 const std::vector< std::vector<SubFieldCall> >& calls) const;
00094
00095 void addSubFieldHeaderRecords(
00096 std::vector<cgatools::conv::VcfSubFieldHeaderRecord>& result) const;
00097
00098 size_t getAlleleCount(size_t alleleIdx) const;
00099
00100 private:
00101 struct HaploEntry
00102 {
00103 HaploEntry()
00104 : pos_(0),
00105 lastSeenLocusCount_(0),
00106 allele1_(0)
00107 {
00108 }
00109
00110 HaploEntry(uint32_t pos, uint32_t locusCount, bool allele1)
00111 : pos_(pos),
00112 lastSeenLocusCount_(locusCount),
00113 allele1_(allele1)
00114 {
00115 }
00116
00117 uint32_t pos_;
00118 uint32_t lastSeenLocusCount_;
00119 bool allele1_;
00120 };
00121
00122 void addAnnotation(std::vector< boost::shared_ptr<SubFieldAnnotation> >& anns,
00123 SubFieldAnnotation* pAnn);
00124
00125 std::string getRefColumn() const;
00126
00127 void fixN(std::string& result) const;
00128
00129 bool isEmptyAnn(const std::string& ann) const;
00130
00131 uint32_t swapIfNeeded(
00132 size_t gIdx,
00133 PhasedHypothesis& hyp,
00134 std::map< std::string, HaploEntry >& haploMap) const;
00135
00136 bool callBelongsInRecord(const Call& call) const;
00137
00138 size_t addVcfAllele(const std::string& allele);
00139
00140 void appendScores(const PhasedAllele& allele, size_t gIdx);
00141
00142 double getCalibratedScore(size_t gIdx, const Locus* locus, const Call* call, bool eaf);
00143
00144 bool isHomozygousAlt(const Locus* locus) const;
00145
00146 std::string getLocusVarType(const Locus* locus);
00147
00148 std::string sanitizeVarType(const std::string& varType);
00149
00150 std::string getVcfAllele(const Superlocus& sl,
00151 const PhasedAllele& allele);
00152
00153 private:
00154 const cgatools::reference::CrrFile& crr_;
00155 const std::vector< boost::shared_ptr<VariantFileIterator> >& var_;
00156 std::string calibPrefix_;
00157 std::vector<bool> hasCalibration_;
00158 std::map< std::string, boost::shared_ptr<calib::CalibratedScorer> > calibrations_;
00159 const Superlocus* sl_;
00160 VcfData rec_;
00161 std::vector< std::map< std::string, HaploEntry > > haploMap_;
00162 std::vector< std::vector<SubFieldCall> > infoCalls_;
00163 std::vector< std::vector< std::vector<SubFieldCall> > > gtCalls_;
00164 std::vector< boost::shared_ptr<SubFieldAnnotation> > infoAnn_;
00165 std::vector< boost::shared_ptr<SubFieldAnnotation> > gtAnn_;
00166 std::vector< std::vector<std::string> > gtAnnData_;
00167 std::vector<bool> gtAnnFlag_;
00168 std::vector< std::string > asmIds_;
00169 std::vector< std::string > asmIdSuffixes_;
00170 std::vector< bool > hasSomatic_;
00171 std::vector< std::vector<size_t> > otherIdx_;
00172 uint32_t locusCount_;
00173 };
00174
00175 class VariantFileVcfRecordSource : public cgatools::conv::VcfRecordSource
00176 {
00177 public:
00178 VariantFileVcfRecordSource(
00179 const std::vector< boost::shared_ptr<VariantFileIterator> >& var,
00180 const std::vector<std::string>& fieldNames,
00181 const cgatools::reference::CrrFile& crr,
00182 const std::string& calibPrefix,
00183 bool includeNoCalls);
00184
00185
00186
00187 std::vector<cgatools::conv::VcfSubFieldHeaderRecord> getSubFieldHeaderRecords() const;
00188 std::string getSource(size_t idxGenome) const;
00189 std::vector<cgatools::conv::VcfKvHeaderRecord> getKeyValueHeaderRecords(size_t idxGenome) const;
00190 std::string getAssemblyId(size_t idxGenome) const;
00191
00192 bool eof() const;
00193 cgatools::conv::VcfRecordSource& operator++();
00194 const cgatools::conv::VcfRecordWriter& operator*() const;
00195 const cgatools::conv::VcfRecordWriter* operator->() const;
00196
00197 private:
00198 class OrderBySizeAscending
00199 {
00200 public:
00201 OrderBySizeAscending(
00202 const std::vector< std::vector<PhasedHypothesis> >& hypAll)
00203 : hypAll_(hypAll)
00204 {
00205 }
00206
00207 bool operator()(size_t lhs, size_t rhs) const
00208 {
00209 if (hypAll_[lhs].size() != hypAll_[rhs].size())
00210 return hypAll_[lhs].size() < hypAll_[rhs].size();
00211 return lhs < rhs;
00212 }
00213
00214 private:
00215 const std::vector< std::vector<PhasedHypothesis> >& hypAll_;
00216 };
00217
00218 void initSuperlocus();
00219
00220 void splitForVcf(
00221 const cgatools::variants::Superlocus& sl,
00222 std::vector<cgatools::reference::Range>& ranges) const;
00223 void addVariantRanges(
00224 std::vector<cgatools::reference::Range>& ranges,
00225 const std::pair<std::deque<cgatools::variants::Locus>::const_iterator,
00226 std::deque<cgatools::variants::Locus>::const_iterator>& loci,
00227 std::vector<uint32_t>& insPositions) const;
00228 bool isInsertionAtLeft(
00229 const std::string& rr,
00230 const std::string& aa) const;
00231 bool isInsertionAtRight(
00232 const std::string& rr,
00233 const std::string& aa) const;
00234 void addNoCallPositions(
00235 std::vector<cgatools::reference::Range>& ranges,
00236 const std::pair<std::deque<cgatools::variants::Locus>::const_iterator,
00237 std::deque<cgatools::variants::Locus>::const_iterator>& loci,
00238 const std::vector<uint32_t>& insPositions) const;
00239 void addNoCallSplits(
00240 std::vector<uint32_t>& ncSplits,
00241 const std::pair<std::deque<cgatools::variants::Locus>::const_iterator,
00242 std::deque<cgatools::variants::Locus>::const_iterator>& loci) const;
00243 void splitAndAddRange(
00244 std::vector<cgatools::reference::Range>& ranges,
00245 const cgatools::reference::Range& range,
00246 const std::vector<uint32_t>& ncSplits) const;
00247
00248 void phaseHaplotypes(
00249 const cgatools::variants::Superlocus& sl,
00250 std::vector<cgatools::variants::PhasedHypothesis>& hyp) const;
00251 int scoreHypothesis(
00252 const cgatools::variants::PhasedHypothesis& hyp,
00253 const std::set<std::string>& popAlleles) const;
00254 void addPopAlleles(
00255 const cgatools::variants::PhasedHypothesis& hyp,
00256 std::set<std::string>& popAlleles) const;
00257
00258 const cgatools::reference::CrrFile& crr_;
00259 std::vector< boost::shared_ptr<VariantFileIterator> > var_;
00260 SuperlocusIterator slIt_;
00261 boost::shared_ptr<VariantFileVcfRecordWriter> writer_;
00262 std::vector<cgatools::reference::Range> ranges_;
00263 size_t rangesIdx_;
00264 std::vector<cgatools::variants::PhasedHypothesis> hyp_;
00265 bool includeNoCalls_;
00266 };
00267
00268 } }
00269
00270 #endif // CGATOOLS_VARIANTS_VARIANTFILEVCFSOURCE_HPP_