00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #ifndef CGATOOLS_REFERENCE_RANGEANNOTATIONSTORE_HPP_
00016 #define CGATOOLS_REFERENCE_RANGEANNOTATIONSTORE_HPP_ 1
00017
00019
00020 #include "cgatools/core.hpp"
00021
00022 #include <string>
00023 #include <vector>
00024 #include <boost/shared_ptr.hpp>
00025
00026 #include "cgatools/util/Streams.hpp"
00027 #include "cgatools/util/DelimitedFile.hpp"
00028 #include "cgatools/util/DelimitedLineParser.hpp"
00029 #include "cgatools/util/RangeIntersector.hpp"
00030 #include "cgatools/reference/CrrFile.hpp"
00031 #include "cgatools/reference/ChromosomeIdField.hpp"
00032
00033 using cgatools::util::InputStream;
00034
00035 namespace cgatools { namespace reference {
00036
00037
00043 template <typename Derived, typename TValue>
00044 class RangeAnnotationStore
00045 {
00046 private:
00047 public:
00048 typedef util::IntervalTree<reference::Range,
00049 reference::Location,
00050 TValue,
00051 reference::RangeOverlap,
00052 reference::GetRangeBoundary > DataStore;
00053 typedef typename DataStore::QueryResultType QueryResultType;
00054
00055 RangeAnnotationStore(const reference::CrrFile& crr)
00056 : crr_(crr)
00057 {}
00058
00060 const util::DelimitedFileMetadata& getMetadata() const
00061 {
00062 return metadata_;
00063 }
00064
00067 void intersect(const reference::Range& range,
00068 std::vector<QueryResultType>& result) const
00069 {
00070 data_.intersect(range, result);
00071 }
00072
00073 #if 0
00076 void bindColumns(util::DelimitedFile& df, reference::Range& range, TValue& data)
00077 #endif
00078
00079 protected:
00080 const reference::CrrFile& crr_;
00081 DataStore data_;
00082
00085 typedef RangeAnnotationStore<Derived, TValue> Base;
00086
00088 void load(const std::string& fn, char delimiter = '\t')
00089 {
00090 boost::shared_ptr<std::istream> in =
00091 InputStream::openCompressedInputStreamByExtension(fn);
00092 util::DelimitedFile df(*in, fn, delimiter);
00093 metadata_ = df.getMetadata();
00094
00095 reference::Range range;
00096 TValue payload;
00097 static_cast<Derived*>(this)->bindColumns(df, range, payload);
00098
00099 while (df.next())
00100 data_.put(range, payload);
00101 }
00102
00103
00107 void bindRangeColumns(util::DelimitedFile& df,
00108 reference::Range& range,
00109 const std::string& chrColName = "chromosome",
00110 const std::string& begColName = "begin",
00111 const std::string& endColName = "end")
00112 {
00113 df.addField(ChromosomeIdField(chrColName, &range.chromosome_, crr_));
00114 df.addField(util::ValueField<uint32_t>(begColName, &range.begin_));
00115 df.addField(util::ValueField<uint32_t>(endColName, &range.end_));
00116 }
00117
00118
00120 template <typename Value>
00121 class MidpointField : public util::DelimitedFieldParser
00122 {
00123 public:
00124 MidpointField(const std::string& name, Value* begin, Value* end)
00125 : DelimitedFieldParser(name),
00126 begin_(begin),
00127 end_(end)
00128 {
00129 }
00130
00131 void parse(const char* first, const char* last)
00132 {
00133 *begin_ = util::parseValue<Value>(first, last);
00134 *end_ = *begin_;
00135 }
00136
00137 private:
00138 uint32_t* begin_;
00139 uint32_t* end_;
00140 };
00141
00145 void bindRangeMidpointColumns(util::DelimitedFile& df,
00146 reference::Range& range,
00147 const std::string& chrColName = "chromosome",
00148 const std::string& midpointColName = "position")
00149 {
00150 df.addField(ChromosomeIdField(chrColName, &range.chromosome_, crr_));
00151 df.addField(MidpointField<uint32_t>(midpointColName, &range.begin_, &range.end_));
00152 }
00153
00154
00155 private:
00156 util::DelimitedFileMetadata metadata_;
00157 };
00158
00159 }}
00160
00161 #endif