Commit c9b43f2d authored by karius's avatar karius

initial commit

parents
/*
* CBlastPdbInputSource.cpp
*
* Created on: Feb 19, 2019
* Author: kkarius
*/
#include <CBlastPdbInputSource.h>
#include <corelib/ncbistre.hpp>
#include <iostream>
#include <string>
CBlastPdbInputSource::CBlastPdbInputSource(const string& filename) : m_InputReader(*new CNcbiIfstream(filename, ios::binary)) {
fMapLoaded = x_initAAMap();
} // @suppress("Member declaration not found")
ncbi::CRef<ncbi::objects::CSeq_loc> CBlastPdbInputSource::x_PdbToSeqLoc(
ncbi::CScope& scope) {
//
// CRef < CSeq_entry > seq_entry(m_InputReader->ReadOneSeq());
// _ASSERT(seq_entry.NotEmpty());
// scope.AddTopLevelSeqEntry(*seq_entry);
//
// CRef < CSeq_loc > retval(new CSeq_loc());
//
// // Get the sequence length
// const TSeqPos seqlen = seq_entry->GetSeq().GetInst().GetLength();
//
// // set sequence range
// retval->SetInt().SetFrom(from);
// retval->SetInt().SetTo((to > 0 && to < seqlen) ? to : (seqlen - 1));
//
// // set ID
// retval->SetInt().SetId().Assign(
// *FindBestChoice(itr->GetId(), CSeq_id::BestRank));
//
// return retval;
}
ncbi::CTempString CBlastPdbInputSource::x_getAA(ncbi::CTempString line) {
if (fMapLoaded) {
//according to documentation in column 18-20
std::map<std::string, std::string>::iterator it = m_AACodes321.find(
line.substr(17, 3));
ncbi::CTempString aa;
if (it != m_AACodes321.end()) {
aa = it->second;
} else {
return UNKNOWN_AA;
}
}
return ncbi::CTempString();
}
bool CBlastPdbInputSource::x_initAAMap() {
try {
ncbi::CRowReader < ncbi::TRowReaderStream_SingleCommaDelimited
> aamap_reader("aa_codes_321.tsv");
for (const auto & row : aamap_reader) {
m_AACodes321[row[0].GetOriginalData()] = row[1].GetOriginalData();
}
} catch (const std::exception& e) {
//generate Warning
return false;
}
return true;
}
CBlastPdbInputSource::~CBlastPdbInputSource() {
}
ncbi::blast::SSeqLoc CBlastPdbInputSource::GetNextSSeqLoc(ncbi::CScope& scope) {
}
ncbi::CRef<ncbi::blast::CBlastSearchQuery> CBlastPdbInputSource::GetNextSequence(
ncbi::CScope& scope) {
}
bool CBlastPdbInputSource::End() {
return true;
}
/*
* CBlastPdbInputSource.h
*
* Created on: Feb 19, 2019
* Author: kkarius
*/
#ifndef CBLASTPDBINPUTSOURCE_H_
#define CBLASTPDBINPUTSOURCE_H_
#include <algo/blast/blastinput/blast_input.hpp>
#include <corelib/ncbistre.hpp>
#include <CPDBSeq.h>
#include <CPdbLineReader.h>
#include <util/row_reader_char_delimited.hpp>
#include <Test.h>
class CBlastPdbInputSource: public ncbi::blast::CBlastInputSource {
public:
CBlastPdbInputSource(const string& filename);
// CBlastPdbInputSource(ncbi::CNcbiIstream& infile,const ncbi::blast::CBlastInputSourceConfig& iconfig);
virtual ~CBlastPdbInputSource();
protected:
ncbi::blast::SSeqLoc GetNextSSeqLoc(ncbi::CScope& scope);
ncbi::CRef<ncbi::blast::CBlastSearchQuery> GetNextSequence(
ncbi::CScope& scope);
bool End();
private:
// ncbi::blast::CBlastInputSourceConfig m_Config; ///< Configuration for the sequences to be read
// ncbi::CRef<ncbi::ILineReader> m_LineReader; ///< interface to read lines
/// Reader of PDB sequences or identifiers
CPdbLineReader m_InputReader;
ncbi::CTempString x_getAA(ncbi::CTempString line);
bool x_initAAMap(void);
std::map<std::string, std::string> m_AACodes321;
bool fMapLoaded;
ncbi::CRef<ncbi::objects::CSeq_loc> x_PdbToSeqLoc(ncbi::CScope& scope);
/// Read a single sequence from file and convert to a Seq_loc
/// @param lcase_mask A Seq_loc that describes the
/// lowercase-masked regions in the query that was read in.
/// If there are no such locations, the Seq_loc is of type
/// 'null', otherwise it is of type 'packed_seqint' [out]
/// @param scope CScope object to which the read sequence is added [in]
/// @return The sequence in Seq_loc format
///
// ncbi::CRef<ncbi::objects::CSeq_loc>
// x_FastaToSeqLoc(ncbi::CRef<ncbi::objects::CSeq_loc>& lcase_mask, ncbi::CScope& scope);
/// Initialization method for the input reader
// void x_InitInputReader();
};
#endif /* CBLASTPDBINPUTSOURCE_H_ */
/*
* CPDBSeq.cpp
*
* Created on: Feb 14, 2019
* Author: kkarius
*/
#include "CPDBSeq.h"
#include <string>
#include <iostream>
#include <util/row_reader_char_delimited.hpp>
#include <corelib/ncbistr.hpp>
#include <corelib/ncbistr_util.hpp>
CPDBSeq::CPDBSeq() {
}
CPDBSeq::~CPDBSeq() {
// TODO Auto-generated destructor stub
}
void CPDBSeq::setPDBFilePath(const string& filename) { // @suppress("Member declaration not found")
CRowReader<CRowReaderStream_Base> pdb_reader(filename);
initAAMap();
std::string last_chain;
int last_residue = -1;
std::map<string, string> chain_to_sequence;
for (const auto & row : pdb_reader) {
Record record = getRecord(row.GetOriginalData().data());
if (record.type == ATOM or record.type == HETATM) {
record.residue_id = aa_codes_321[record.residue_id];
//initialize chain and residue index
if (last_chain.empty()) {
last_chain = record.chain_id;
}
//initialize chain id in chain-to-seq map
if (!chain_to_sequence.count(last_chain)) {
chain_to_sequence[last_chain] = record.residue_id;
} else {
if (record.residue_index != last_residue) {
if (record.residue_index == last_residue + 1) {
chain_to_sequence[last_chain].append(record.residue_id);
} else if (last_chain == record.chain_id) {
fillGaps(chain_to_sequence[last_chain],
record.residue_index - last_residue - 1);
chain_to_sequence[last_chain].append(record.residue_id);
}
}
}
last_residue = record.residue_index;
last_chain = record.chain_id;
}
}
for (auto const &ent1 : chain_to_sequence) {
// ent1.first is the first key
std::cout << ent1.first << " " << ent1.second << endl;
}
}
bool CPDBSeq::initAAMap() {
CRowReader<TRowReaderStream_SingleCommaDelimited> aamap_reader(
"aa_codes_321.tsv");
for (const auto & row : aamap_reader) {
aa_codes_321[row[0].GetOriginalData()] = row[1].GetOriginalData();
}
return true;
}
void CPDBSeq::fillGaps(string& sequence, int length) {
for (int i = 0; i < length; i++) {
sequence.append(GAP_SYMBOL);
}
}
Record CPDBSeq::getRecord(string line) {
Record record;
record.type = getType(line);
//TODO Check if ATOM2-9 are important
if (record.type == ATOM) { // or record.type == HETATM) {
record.residue_id = line.substr(17, 3);
record.chain_id = line.substr(21, 1);
record.residue_index = NStr::StringToInt(
NStr::TruncateSpaces(line.substr(22, 4)));
}
return record;
}
//STOLEN from Chimera
RecordType CPDBSeq::getType(string line) {
char rt[7]; // PDB record type
int i;
for (i = 0; line[i] != '\0' && line[i] != '\n' && i < 6; i += 1) {
if (islower(line[i]))
rt[i] = _toupper(line[i]);
else
rt[i] = line[i];
}
if (i < 6)
for (; i < 6; i += 1)
rt[i] = ' ';
rt[6] = '\0';
switch (rt[0]) {
case 'A':
switch (rt[1]) {
case 'N':
if (strcmp(rt + 2, "ISOU") == 0)
return ANISOU;
break;
case 'T':
if (strcmp(rt + 2, "OM ") == 0)
return ATOM;
if (rt[4] == ' ' && rt[5] >= '1' && rt[5] <= '9')
return (RecordType) (ATOM + (rt[5] - '0'));
break;
case 'U':
if (strcmp(rt + 2, "THOR") == 0)
return AUTHOR;
break;
}
break;
case 'C':
switch (rt[1]) {
case 'A':
if (strcmp(rt + 2, "VEAT") == 0)
return CAVEAT;
break;
case 'I':
if (strcmp(rt + 2, "SPEP") == 0)
return CISPEP;
break;
case 'O':
if (strcmp(rt + 2, "MPND") == 0)
return COMPND;
if (strcmp(rt + 2, "NECT") == 0)
return CONECT;
break;
case 'R':
if (strcmp(rt + 2, "YST1") == 0)
return CRYST1;
break;
}
break;
case 'D':
if (strcmp(rt + 1, "BREF ") == 0)
return DBREF;
if (strcmp(rt + 1, "BREF1") == 0)
return DBREF1;
if (strcmp(rt + 1, "BREF2") == 0)
return DBREF2;
break;
case 'E':
switch (rt[1]) {
case 'N':
if (strcmp(rt + 2, "D ") == 0)
return END;
if (strcmp(rt + 2, "DMDL") == 0)
return ENDMDL;
break;
case 'X':
if (strcmp(rt + 2, "PDTA") == 0)
return EXPDTA;
break;
}
break;
case 'F':
switch (rt[1]) {
case 'T':
if (strcmp(rt + 2, "NOTE") == 0)
return FTNOTE;
break;
case 'O':
if (strcmp(rt + 2, "RMUL") == 0)
return FORMUL;
break;
}
break;
case 'H':
switch (rt[1]) {
case 'E':
if (strcmp(rt + 2, "TATM") == 0)
return HETATM;
if (strcmp(rt + 2, "ADER") == 0)
return HEADER;
if (strcmp(rt + 2, "T ") == 0)
return HET;
if (strcmp(rt + 2, "TNAM") == 0)
return HETNAM;
if (strcmp(rt + 2, "TSYN") == 0)
return HETSYN;
if (strcmp(rt + 2, "LIX ") == 0)
return HELIX;
break;
case 'Y':
if (strcmp(rt + 2, "DBND") == 0)
return HYDBND;
break;
}
break;
case 'J':
if (strcmp(rt + 1, "RNL ") == 0)
return JRNL;
break;
case 'K':
if (strcmp(rt + 1, "EYWDS") == 0)
return KEYWDS;
break;
case 'L':
if (strcmp(rt + 1, "INK ") == 0)
return LINK;
break;
case 'M':
switch (rt[1]) {
case 'A':
if (strcmp(rt + 2, "STER") == 0)
return MASTER;
break;
case 'D':
if (strcmp(rt + 2, "LTYP") == 0)
return MDLTYP;
break;
case 'O':
if (strcmp(rt + 2, "DEL ") == 0)
return MODEL;
if (strcmp(rt + 2, "DRES") == 0)
return MODRES;
break;
case 'T':
if (strcmp(rt + 2, "RIX1") == 0 || strcmp(rt + 2, "RIX2") == 0
|| strcmp(rt + 2, "RIX3") == 0)
return MTRIX;
break;
}
break;
case 'N':
switch (rt[1]) {
case 'U':
if (strcmp(rt + 2, "MMDL") == 0)
return NUMMDL;
break;
}
break;
case 'O':
switch (rt[1]) {
case 'B':
if (strcmp(rt + 2, "SLTE") == 0)
return OBSLTE;
break;
case 'R':
if (strcmp(rt + 2, "IGX1") == 0 || strcmp(rt + 2, "IGX2") == 0
|| strcmp(rt + 2, "IGX3") == 0)
return ORIGX;
break;
}
break;
case 'R':
if (rt[1] != 'E')
break;
if (strcmp(rt + 2, "MARK") == 0)
return REMARK;
if (strcmp(rt + 2, "VDAT") == 0)
return REVDAT;
break;
case 'S':
switch (rt[1]) {
case 'C':
if (strcmp(rt + 2, "ALE1") == 0 || strcmp(rt + 2, "ALE2") == 0
|| strcmp(rt + 2, "ALE3") == 0)
return SCALE;
break;
case 'E':
if (strcmp(rt + 2, "QRES") == 0)
return SEQRES;
if (strcmp(rt + 2, "QADV") == 0)
return SEQADV;
break;
case 'H':
if (strcmp(rt + 2, "EET ") == 0)
return SHEET;
break;
case 'I':
if (strcmp(rt + 2, "TE ") == 0)
return SITE;
if (strcmp(rt + 2, "GATM") == 0)
return SIGATM;
if (strcmp(rt + 2, "GUIJ") == 0)
return SIGUIJ;
break;
case 'L':
if (strcmp(rt + 2, "TBRG") == 0)
return SLTBRG;
break;
case 'O':
if (strcmp(rt + 2, "URCE") == 0)
return SOURCE;
break;
case 'P':
if (strcmp(rt + 2, "RSDE") == 0)
return SPRSDE;
if (strcmp(rt + 2, "LIT ") == 0)
return SPLIT;
break;
case 'S':
if (strcmp(rt + 2, "BOND") == 0)
return SSBOND;
break;
}
break;
case 'T':
switch (rt[1]) {
case 'E':
if (strcmp(rt + 2, "R ") == 0)
return TER;
break;
case 'I':
if (strcmp(rt + 2, "TLE ") == 0)
return TITLE;
break;
case 'O':
if (strcmp(rt + 2, "RSDO") == 0)
return END;
break;
case 'U':
if (strcmp(rt + 2, "RN ") == 0)
return TURN;
break;
case 'V':
if (strcmp(rt + 2, "ECT ") == 0)
return TVECT;
break;
}
break;
// like most of this function, not really necessary
// case 'U':
// if (rt[1] == 'S' && rt[2] == 'E' && rt[3] == 'R')
// switch (pdbrunInputVersion) {
// case 1: case 2: case 3: case 4: case 5:
// return pdbrun5Type(line + 6);
// case 6: case 7:
// return pdbrun6Type(line + 6);
// default:
// // bootstrap by recognizing PDBRUN
// if (strncasecmp(line + 6, "PDBRUN ", 7) == 0)
// return USER_PDBRUN;
// return USER;
// }
// break;
}
return UNKNOWN;
}
/*
* CPDBSeq.h
*
* Created on: Feb 14, 2019
* Author: kkarius
*/
#ifndef CPDBSEQ_H_
#define CPDBSEQ_H_
#include <corelib/ncbiobj.hpp>
#include <util/row_reader_ncbi_tsv.hpp>
#include <util/row_reader_char_delimited.hpp>
#include <algo/blast/api/sseqloc.hpp>
#include <string>
USING_NCBI_SCOPE;
USING_SCOPE(blast);
//copied from PDB.h of chimera implementation
//enum RecordType { UNKNOWN,
// ANISOU, ATOM, ATOM1, ATOM2, ATOM3, ATOM4, ATOM5, ATOM6, ATOM7,
// ATOM8, ATOM9, ATOMQR, AUTHOR,
// CAVEAT, CISPEP, COMPND, CONECT, CRYST1,
// DBREF, DBREF1, DBREF2, END, ENDMDL, EXPDTA, FORMUL, FTNOTE,
// HEADER, HELIX, HET, HETATM, HETNAM, HETSYN, HYDBND,
// JRNL, KEYWDS, LINK,
// MASTER, MDLTYP, MODEL, MODRES, MTRIX, NUMMDL,
// OBSLTE, ORIGX, REMARK, REVDAT,
// SCALE, SEQADV, SEQRES, SHEET, SIGATM, SIGUIJ, SITE, SLTBRG,
// SOURCE, SPLIT, SPRSDE, SSBOND,
// TER, TITLE, TURN, TVECT,
// USER,
// USER_PDBRUN, USER_EYEPOS, USER_ATPOS, USER_WINDOW, USER_FOCUS,
// USER_VIEWPORT, USER_BGCOLOR, USER_ANGLE, USER_DISTANCE,
// USER_FILE, USER_MARKNAME, USER_MARK, USER_CNAME, USER_COLOR,
// USER_RADIUS, USER_OBJECT, USER_ENDOBJ, USER_CHAIN,
// USER_GFX_BEGIN, USER_GFX_END, USER_GFX_COLOR, USER_GFX_NORMAL,
// USER_GFX_VERTEX, USER_GFX_FONT, USER_GFX_TEXTPOS,
// USER_GFX_LABEL,
// USER_GFX_MOVE, USER_GFX_DRAW, USER_GFX_MARKER, USER_GFX_POINT
// };
//
////"X" for reasons of blast documentation
//static const string GAP_SYMBOL="X";
//struct Record {
// RecordType type;
// string chain_id;
// int residue_index;
// string residue_id;
//};
class CPDBSeq: public ncbi::CObject {
public:
CPDBSeq();
virtual ~CPDBSeq();
void setPDBFilePath(const string& filename);
TSeqLocVector getSeqLocs();
private:
TSeqLocVector sequences;
map<string,string> aa_codes_321;
void parse();
void fillGaps(string&,int);
bool initAAMap();
// RecordType getType(string line);
// Record getRecord(string line);
};
#endif /* CPDBSEQ_H_ */
/*
* CPdbLineReader.cpp
*
* Created on: Feb 20, 2019
* Author: kkarius
*/
#include <CPdbLineReader.h>
#include <iostream>
std::string CPdbLineReader::TRANSLATE_AA_321(
std::string three_letter_aa_code) {
try {
return AA_321_MAP.at(three_letter_aa_code);
} catch (const std::exception& e) {
std::cout << "Gtfo, organ bank." << ncbi::Endl();
return UNKNOWN_AA;
}
}
CPdbLineReader::CPdbLineReader(ncbi::CNcbiIstream& in) :
ncbi::CStreamLineReader(in, eTakeOwnership) {
}
CPdbLineReader::~CPdbLineReader() {
}
bool CPdbLineReader::firstValid(){
return m_firstValid;
}
CPdbLineReader& CPdbLineReader::operator ++() {
if ( m_UngetLine ) {
m_UngetLine = false;
return *this;
}
this->x_findNextResidue();
m_firstValid = true;
return *this;
}
unsigned int CPdbLineReader::getResidueNumber(ncbi::CTempString line) {
//residue number per documentation column 22-26 and an int, left filled with spaces
ncbi::CTempString tentativeResidue = ncbi::NStr::TruncateSpaces_Unsafe(
line.substr(22, 5));
try {
unsigned int residueNumber = ncbi::NStr::StringToInt(tentativeResidue);
return residueNumber;
} catch (const std::exception& e) {
return 0;
}
}
std::string CPdbLineReader::getResidueId(ncbi::CTempString line) {
std::string line_s(line);
//residue id per documentation column 18-20 and a three letter code
return line_s.substr(17, 3);
}
unsigned int CPdbLineReader::getAtomNumber(ncbi::CTempString line) {
std::string line_s(line);
//atom number per documentation column 7-11 and an int, left filled with spaces
ncbi::CTempString tentativeAtom = ncbi::NStr::TruncateSpaces_Unsafe(
line_s.substr(6, 5));
try {
unsigned int atomNumber(ncbi::NStr::StringToInt(tentativeAtom));
return atomNumber;
} catch (const std::exception& e) {
return 0;
}
}
std::string CPdbLineReader::getChaindId(ncbi::CTempString line) {
//chain id per documentation column 22 and an ASCII character
return line.substr(21, 1);
}
void CPdbLineReader::x_findNextResidue() {
if (!ncbi::CStreamLineReader::AtEOF()) {
m_LineNumberPrejump = m_LineNumber;
unsigned int currentResidue = m_currentResidue;
unsigned int newResidueNumber = currentResidue;
while (currentResidue == newResidueNumber
&& !ncbi::CStreamLineReader::AtEOF()) {
ncbi::CStreamLineReader::operator ++();
if (x_typeIsValid(x_getType(**this))) {
newResidueNumber = CPdbLineReader::getResidueNumber(**this);
}
}
m_currentResidue = newResidueNumber;
}
}
void CPdbLineReader::UngetLine(void)
{
_ASSERT(m_Pos != m_Line.begin());
_ASSERT(m_Line.begin() != 0);
/* If after UngetLine() or after constructor - noop */
if (m_Pos == m_Line.begin() || m_Line.begin() == 0) {
return;
}
m_LineNumber=m_LineNumberPrejump;
m_Pos = m_Line.begin();
}
bool CPdbLineReader::x_typeIsValid(RecordType type) {
if (type == ATOM || type == HETATM) {
return true;
}
return false;
}
//STOLEN from Chimera
RecordType CPdbLineReader::x_getType(ncbi::CTempString line) {
char rt[7]; // PDB record type
int i;
for (i = 0; line[i] != '\0' && line[i] != '\n' && i < 6; i += 1) {
if (islower(line[i]))
rt[i] = _toupper(line[i]);
else
rt[i] = line[i];
}
if (i < 6)
for (; i < 6; i += 1)
rt[i] = ' ';
rt[6] = '\0';
switch (rt[0]) {
case 'A':
switch (rt[1]) {
case 'N':
if (strcmp(rt + 2, "ISOU") == 0)
return ANISOU;
break;
case 'T':
if (strcmp(rt + 2, "OM ") == 0)
return ATOM;
if (rt[4] == ' ' && rt[5] >= '1' && rt[5] <= '9')
return (RecordType) (ATOM + (rt[5] - '0'));
break;
case 'U':
if (strcmp(rt + 2, "THOR") == 0)
return AUTHOR;
break;
}
break;