00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #ifndef CGATOOLS_VARIANTS_VARIANTFILEVCFSOURCE_HPP_
00016 #define CGATOOLS_VARIANTS_VARIANTFILEVCFSOURCE_HPP_ 1
00017
00019
00020 #include "cgatools/core.hpp"
00021 #include "cgatools/conv/VcfRecordSource.hpp"
00022 #include "cgatools/reference/CrrFile.hpp"
00023 #include "cgatools/variants/SuperlocusIterator.hpp"
00024
00025 #include <deque>
00026 #include <map>
00027 #include <set>
00028 #include <vector>
00029
00030
00031 namespace cgatools { namespace cgdata {
00032 class GenomeMetadata;
00033 } }
00034
00035 namespace cgatools { namespace variants {
00036
00037
00038 class SubFieldAnnotation;
00039 struct SubFieldCall;
00040 namespace calib {
00041 class CalibratedScorer;
00042 }
00043
00044 class VariantFileVcfRecordWriter : public cgatools::conv::VcfRecordWriter
00045 {
00046 public:
00047 VariantFileVcfRecordWriter(
00048 const std::vector< boost::shared_ptr<VariantFileIterator> >& var,
00049 const std::vector< std::string> fieldNames,
00050 const cgatools::reference::CrrFile& crr,
00051 const std::string& calibPrefix);
00052
00053
00054 cgatools::reference::Location getLocation() const;
00055 void writeRef(std::ostream& out) const;
00056 void writeAlt(std::ostream& out) const;
00057 void writeInfo(std::ostream& out) const;
00058 void writeFormat(std::ostream& out) const;
00059 void writeSample(std::ostream& out, size_t gIdx) const;
00060
00061
00062
00063 struct VcfData
00064 {
00065 reference::Range range_;
00066
00067
00068
00069 std::vector<std::string> alleles_;
00070 std::vector< std::vector<size_t> > alleleIdx_;
00071 std::vector<uint32_t> ps_;
00072 std::vector< std::vector<int32_t> > hq_;
00073 std::vector< std::vector<int32_t> > ehq_;
00074 std::vector< std::vector<int32_t> > cehq_;
00075 };
00076
00077 const VcfData& getRecord() const;
00078
00079 void setLocus(
00080 const cgatools::reference::Range& range,
00081 const Superlocus& sl,
00082 std::vector<PhasedHypothesis>& hyp);
00083
00084 size_t getSampleCount() const;
00085
00086 const char* getSomaticStatusAnnotation(
00087 size_t gIdx,
00088 const std::vector< std::vector<SubFieldCall> >& calls) const;
00089
00090 void addSubFieldHeaderRecords(
00091 std::vector<cgatools::conv::VcfSubFieldHeaderRecord>& result) const;
00092
00093 size_t getAlleleCount(size_t alleleIdx) const;
00094
00095 private:
00096 struct HaploEntry
00097 {
00098 HaploEntry()
00099 : pos_(0),
00100 lastSeenLocusCount_(0),
00101 allele1_(0)
00102 {
00103 }
00104
00105 HaploEntry(uint32_t pos, uint32_t locusCount, bool allele1)
00106 : pos_(pos),
00107 lastSeenLocusCount_(locusCount),
00108 allele1_(allele1)
00109 {
00110 }
00111
00112 uint32_t pos_;
00113 uint32_t lastSeenLocusCount_;
00114 bool allele1_;
00115 };
00116
00117 void addAnnotation(std::vector< boost::shared_ptr<SubFieldAnnotation> >& anns,
00118 SubFieldAnnotation* pAnn);
00119
00120 std::string getRefColumn() const;
00121
00122 void fixN(std::string& result) const;
00123
00124 bool isEmptyAnn(const std::string& ann) const;
00125
00126 uint32_t swapIfNeeded(
00127 size_t gIdx,
00128 PhasedHypothesis& hyp,
00129 std::map< std::string, HaploEntry >& haploMap) const;
00130
00131 bool callBelongsInRecord(const Call& call) const;
00132
00133 size_t addVcfAllele(const std::string& allele);
00134
00135 void appendScores(const PhasedAllele& allele, size_t gIdx);
00136
00137 double getCalibratedScore(size_t gIdx, const Locus* locus, const Call* call, bool eaf);
00138
00139 bool isHomozygousAlt(const Locus* locus) const;
00140
00141 std::string getLocusVarType(const Locus* locus);
00142
00143 std::string sanitizeVarType(const std::string& varType);
00144
00145 std::string getVcfAllele(const Superlocus& sl,
00146 const PhasedAllele& allele);
00147
00148 private:
00149 const cgatools::reference::CrrFile& crr_;
00150 const std::vector< boost::shared_ptr<VariantFileIterator> >& var_;
00151 std::string calibPrefix_;
00152 std::vector<bool> hasCalibration_;
00153 std::map< std::string, boost::shared_ptr<calib::CalibratedScorer> > calibrations_;
00154 const Superlocus* sl_;
00155 VcfData rec_;
00156 std::vector< std::map< std::string, HaploEntry > > haploMap_;
00157 std::vector< std::vector<SubFieldCall> > infoCalls_;
00158 std::vector< std::vector< std::vector<SubFieldCall> > > gtCalls_;
00159 std::vector< boost::shared_ptr<SubFieldAnnotation> > infoAnn_;
00160 std::vector< boost::shared_ptr<SubFieldAnnotation> > gtAnn_;
00161 std::vector< std::vector<std::string> > gtAnnData_;
00162 std::vector<bool> gtAnnFlag_;
00163 std::vector< std::string > asmIds_;
00164 std::vector< std::string > asmIdSuffixes_;
00165 std::vector< bool > hasSomatic_;
00166 std::vector< std::vector<size_t> > otherIdx_;
00167 uint32_t locusCount_;
00168 };
00169
00170 class VariantFileVcfRecordSource : public cgatools::conv::VcfRecordSource
00171 {
00172 public:
00173 VariantFileVcfRecordSource(
00174 const std::vector< boost::shared_ptr<VariantFileIterator> >& var,
00175 const std::vector<std::string> fieldNames,
00176 const cgatools::reference::CrrFile& crr,
00177 const std::string& calibPrefix,
00178 bool includeNoCalls);
00179
00180
00181
00182 std::vector<cgatools::conv::VcfSubFieldHeaderRecord> getSubFieldHeaderRecords() const;
00183 std::string getSource(size_t idxGenome) const;
00184 std::vector<cgatools::conv::VcfKvHeaderRecord> getKeyValueHeaderRecords(size_t idxGenome) const;
00185 std::string getAssemblyId(size_t idxGenome) const;
00186
00187 bool eof() const;
00188 cgatools::conv::VcfRecordSource& operator++();
00189 const cgatools::conv::VcfRecordWriter& operator*() const;
00190 const cgatools::conv::VcfRecordWriter* operator->() const;
00191
00192 private:
00193 class OrderBySizeAscending
00194 {
00195 public:
00196 OrderBySizeAscending(
00197 const std::vector< std::vector<PhasedHypothesis> >& hypAll)
00198 : hypAll_(hypAll)
00199 {
00200 }
00201
00202 bool operator()(size_t lhs, size_t rhs) const
00203 {
00204 if (hypAll_[lhs].size() != hypAll_[rhs].size())
00205 return hypAll_[lhs].size() < hypAll_[rhs].size();
00206 return lhs < rhs;
00207 }
00208
00209 private:
00210 const std::vector< std::vector<PhasedHypothesis> >& hypAll_;
00211 };
00212
00213 void initSuperlocus();
00214
00215 void splitForVcf(
00216 const cgatools::variants::Superlocus& sl,
00217 std::vector<cgatools::reference::Range>& ranges) const;
00218 void addVariantRanges(
00219 std::vector<cgatools::reference::Range>& ranges,
00220 const std::pair<std::deque<cgatools::variants::Locus>::const_iterator,
00221 std::deque<cgatools::variants::Locus>::const_iterator>& loci,
00222 std::vector<uint32_t>& insPositions) const;
00223 bool isInsertionAtLeft(
00224 const std::string& rr,
00225 const std::string& aa) const;
00226 bool isInsertionAtRight(
00227 const std::string& rr,
00228 const std::string& aa) const;
00229 void addNoCallPositions(
00230 std::vector<cgatools::reference::Range>& ranges,
00231 const std::pair<std::deque<cgatools::variants::Locus>::const_iterator,
00232 std::deque<cgatools::variants::Locus>::const_iterator>& loci,
00233 const std::vector<uint32_t>& insPositions) const;
00234 void addNoCallSplits(
00235 std::vector<uint32_t>& ncSplits,
00236 const std::pair<std::deque<cgatools::variants::Locus>::const_iterator,
00237 std::deque<cgatools::variants::Locus>::const_iterator>& loci) const;
00238 void splitAndAddRange(
00239 std::vector<cgatools::reference::Range>& ranges,
00240 const cgatools::reference::Range& range,
00241 const std::vector<uint32_t>& ncSplits) const;
00242
00243 void phaseHaplotypes(
00244 const cgatools::variants::Superlocus& sl,
00245 std::vector<cgatools::variants::PhasedHypothesis>& hyp) const;
00246 int scoreHypothesis(
00247 const cgatools::variants::PhasedHypothesis& hyp,
00248 const std::set<std::string>& popAlleles) const;
00249 void addPopAlleles(
00250 const cgatools::variants::PhasedHypothesis& hyp,
00251 std::set<std::string>& popAlleles) const;
00252
00253 const cgatools::reference::CrrFile& crr_;
00254 std::vector< boost::shared_ptr<VariantFileIterator> > var_;
00255 SuperlocusIterator slIt_;
00256 boost::shared_ptr<VariantFileVcfRecordWriter> writer_;
00257 std::vector<cgatools::reference::Range> ranges_;
00258 size_t rangesIdx_;
00259 std::vector<cgatools::variants::PhasedHypothesis> hyp_;
00260 bool includeNoCalls_;
00261 };
00262
00263 } }
00264
00265 #endif // CGATOOLS_VARIANTS_VARIANTFILEVCFSOURCE_HPP_