parsing.cpp
1 #include <cmath>
2 #include "parsing.h"
3 
4 namespace BioLCCC
5 {
6 ParsingException::ParsingException(std::string message):
7  BioLCCCException(message)
8 {
9 };
10 
11 std::vector<double> calculateMonomerEnergyProfile(
12  const std::vector<ChemicalGroup> &parsedSequence,
13  const ChemicalBasis & chemBasis,
14  const double secondSolventConcentration,
15  const double columnRelativeStrength,
16  const double temperature) throw (BioLCCCException)
17 {
18  if (parsedSequence.size() < 3)
19  {
20  throw BioLCCCException(
21  "The parsed sequence contains too little chemical groups.");
22  }
23 
24  if (columnRelativeStrength == 0.0)
25  {
26  return std::vector<double> (parsedSequence.size()-2, 0.0);
27  }
28 
29  // Due to the preliminary scaling the binding energy of water equals zero.
30  double Q = exp(columnRelativeStrength *
31  (chemBasis.secondSolventBindEnergy() - 0.0) *
32  293.0 / temperature);
33  // Nb = (DensityB * %B / MB) /
34  // (DensityB * %B / MB + DensityA * %A / MA)
35  // where DensityA and DensityB are the corresponding densities and
36  // MA and MB are the corresponding molecular weights.
37  double Nb =
38  secondSolventConcentration * chemBasis.secondSolventDensity()
39  / chemBasis.secondSolventAverageMass()
40  / ( secondSolventConcentration * chemBasis.secondSolventDensity()
41  / chemBasis.secondSolventAverageMass()
42  + (100.0 - secondSolventConcentration)
43  * chemBasis.firstSolventDensity()
44  / chemBasis.firstSolventAverageMass());
45 
46  double Eab = 0.0;
47  if (chemBasis.snyderApproximation())
48  {
49  Eab = Nb * chemBasis.secondSolventBindEnergy();
50  }
51  else
52  {
53  Eab = 0.0 + 1.0 / columnRelativeStrength * log( 1.0 - Nb + Nb * Q );
54  }
55 
56  std::vector<double> monomerEnergyProfile;
57  for (std::vector<BioLCCC::ChemicalGroup>::const_iterator residue =
58  ++(parsedSequence.begin());
59  residue != --(parsedSequence.end());
60  ++residue)
61  {
62  double residueEnergy = residue->bindEnergy();
63  double residueArea = residue->bindArea();
64 
65  // Adding the energy of the N-terminal group to the first residue.
66  if (residue == ++(parsedSequence.begin()))
67  {
68  residueEnergy += parsedSequence.begin()->bindEnergy();
69  residueArea += parsedSequence.begin()->bindArea();
70  }
71 
72  // Adding the energy of the C-terminal group to the last residue.
73  else if (residue == --(--(parsedSequence.end())))
74  {
75  residueEnergy += (--(parsedSequence.end()))->bindEnergy();
76  residueArea += (--(parsedSequence.end()))->bindArea();
77  }
78 
79  monomerEnergyProfile.push_back(
80  columnRelativeStrength *
81  //(residueEnergy - Eab) * 293.0 / temperature);
82  (residueEnergy - residueArea * Eab) * 293.0 / temperature);
83  }
84  return monomerEnergyProfile;
85 }
86 
87 std::vector<double> calculateSegmentEnergyProfile(
88  const std::vector<double> &monomerEnergyProfile,
89  const double monomerLength,
90  const double kuhnLength)
91 {
92  std::vector<double> segmentEnergyProfile;
93  std::vector<double>::const_iterator monomerEnergyIt =
94  monomerEnergyProfile.begin();
95  double kuhnLeftBorder = 0;
96  double monomerLeftBorder = 0;
97  double sumEnergy = 0.0;
98  bool kuhnSegmentOpen = true;
99 
100  while (monomerEnergyIt != monomerEnergyProfile.end())
101  {
102  if ((kuhnLeftBorder + kuhnLength) >=
103  (monomerLeftBorder + monomerLength))
104  {
105  sumEnergy += *(monomerEnergyIt) *
106  (monomerLeftBorder + monomerLength -
107  std::max(monomerLeftBorder, kuhnLeftBorder)) /
108  monomerLength;
109  kuhnSegmentOpen = true;
110  monomerLeftBorder += monomerLength;
111  ++monomerEnergyIt;
112  }
113  else
114  {
115  sumEnergy += *(monomerEnergyIt) *
116  (kuhnLeftBorder + kuhnLength -
117  std::max(monomerLeftBorder, kuhnLeftBorder)) /
118  monomerLength;
119  segmentEnergyProfile.push_back(sumEnergy);
120  sumEnergy = 0.0;
121  kuhnSegmentOpen = false;
122  kuhnLeftBorder += kuhnLength;
123  }
124  }
125 
126  if (kuhnSegmentOpen)
127  {
128  segmentEnergyProfile.push_back(sumEnergy);
129  }
130 
131  return segmentEnergyProfile;
132 }
133 
134 std::vector<ChemicalGroup> parseSequence(
135  const std::string &source,
136  const ChemicalBasis &chemBasis) throw(BioLCCCException)
137 {
138  std::vector<ChemicalGroup> parsedSequence;
139  ChemicalGroup NTerminus;
140  ChemicalGroup CTerminus;
141 
142  // At first we need to strip the sequence from adjacent amino acids.
143  // If a source sequence contains them, it should contain two dots, so we
144  // need the part of sequence between them.
145  std::size_t firstDotPosition = 0;
146  std::size_t secondDotPosition = 0;
147 
148  // We'll use this variable to contain peptide sequence without adjacent
149  // amino acids.
150  std::string strippedSource = source;
151 
152  firstDotPosition = source.find(".");
153 
154  if (firstDotPosition != std::string::npos)
155  {
156  secondDotPosition = source.find(".", firstDotPosition+1);
157  if (secondDotPosition != std::string::npos)
158  {
159 
160  // If a source sequence contains more that two dots, it's broken.
161  if (source.find(".", secondDotPosition+1) != std::string::npos)
162  {
163  throw ParsingException(
164  "The sequence " + source +" contains more than two dots.");
165  }
166  else
167  {
168  strippedSource = source.substr(firstDotPosition+1,
169  secondDotPosition - firstDotPosition - 1);
170  }
171  }
172  // If a source sequence contains only one dot, it's broken.
173  else
174  {
175  throw ParsingException(
176  "The sequence " + source + " contains only one dot.");
177  }
178  }
179 
180  // Than goes parsing.
181  std::size_t NTerminusPosition = 0;
182 
183  // First we need to check the stripped source sequence for
184  // the N-Terminal group.
185  NTerminus = chemBasis.defaultNTerminus();
186  for (std::map<std::string,ChemicalGroup>::const_iterator
187  NTerminusIterator = chemBasis.chemicalGroups().begin();
188  NTerminusIterator != chemBasis.chemicalGroups().end();
189  NTerminusIterator++)
190  {
191  if (NTerminusIterator->second.isNTerminal())
192  {
193  if (strippedSource.find(NTerminusIterator->second.label()) ==
194  (size_t)0)
195  {
196  NTerminus = NTerminusIterator->second;
197  NTerminusPosition = NTerminusIterator->second.label().size();
198  }
199  }
200  }
201 
202  // Then we need to found the location of the C-Terminus.
203  CTerminus = chemBasis.defaultCTerminus();
204  std::size_t CTerminusPosition;
205  CTerminusPosition = strippedSource.find("-", NTerminusPosition);
206  if (CTerminusPosition != std::string::npos)
207  {
208  // The sequence should not contain hyphens after C-terminal group.
209  if (strippedSource.find("-", CTerminusPosition+1) != std::string::npos)
210  {
211  throw ParsingException(
212  "The sequence " + source +
213  " contains hyphen after C-terminal group.");
214  }
215 
216  // Searching for known C-terminal groups.
217  bool CTerminusFound = false;
218  for (std::map<std::string,ChemicalGroup>::const_iterator
219  CTerminusIterator = chemBasis.chemicalGroups().begin();
220  CTerminusIterator != chemBasis.chemicalGroups().end();
221  CTerminusIterator++)
222  {
223  if (CTerminusIterator->second.isCTerminal())
224  {
225  if (strippedSource.find(CTerminusIterator->second.label(),
226  CTerminusPosition) != std::string::npos)
227  {
228  CTerminus = CTerminusIterator->second;
229  CTerminusFound = true;
230  }
231  }
232  }
233  if (!CTerminusFound)
234  {
235  throw ParsingException(
236  "The sequence " + source +
237  " contains unknown C-terminal group\"" +
238  source.substr(CTerminusPosition) + "\".");
239  }
240  }
241  else
242  {
243  CTerminusPosition = strippedSource.size();
244  }
245 
246  // At this step we obtain the sequence of a peptide without adjacent
247  // amino acids and terminal groups.
248  strippedSource = strippedSource.substr(
249  NTerminusPosition, CTerminusPosition-NTerminusPosition);
250 
251  // We need to check whether it contains any non-letter characters.
252  for (std::size_t i=0; i<strippedSource.size(); i++)
253  {
254  if (!(((int(strippedSource[i]) >= int('a')) &&
255  (int(strippedSource[i]) <= int('z'))) ||
256  ((int(strippedSource[i]) >= int('A')) &&
257  (int(strippedSource[i]) <= int('Z')))))
258  {
259  throw ParsingException(
260  "The sequence " + source +
261  " contains a non-letter character.");
262  }
263  }
264 
265  // Then we divide the whole sequence into aminoacids.
266  bool aminoAcidFound;
267  size_t curPos = 0;
268  while (curPos < strippedSource.size())
269  {
270  aminoAcidFound = false;
271  for (std::map<std::string,ChemicalGroup>::const_iterator
272  aminoAcidIterator = chemBasis.chemicalGroups().begin();
273  aminoAcidIterator != chemBasis.chemicalGroups().end();
274  aminoAcidIterator++)
275  {
276  if (strippedSource.compare(curPos,
277  aminoAcidIterator->second.label().size(),
278  aminoAcidIterator->second.label()) == 0)
279  {
280  curPos += aminoAcidIterator->second.label().size();
281  parsedSequence.push_back(aminoAcidIterator->second);
282  aminoAcidFound = true;
283  break;
284  }
285  }
286 
287  if (!aminoAcidFound)
288  {
289  throw ParsingException(
290  "The sequence " + source + " contains unknown amino acid \"" +
291  source.substr(curPos, 1) + "\".");
292  }
293  }
294  parsedSequence.insert(parsedSequence.begin(), NTerminus);
295  parsedSequence.push_back(CTerminus);
296  return parsedSequence;
297 }
298 }
299 
An instance of ChemicalBasis contains a set of BioLCCC constants.
Definition: chemicalbasis.h:86
std::vector< ChemicalGroup > parseSequence(const std::string &source, const ChemicalBasis &chemBasis)
Parses a given peptide sequence.
Definition: parsing.cpp:134
Apart from classes, BioLCCC contains calculation methods and constants.
Definition: auxiliary.h:4
std::vector< double > calculateMonomerEnergyProfile(const std::vector< ChemicalGroup > &parsedSequence, const ChemicalBasis &chemBasis, const double secondSolventConcentration, const double columnRelativeStrength, const double temperature)
Calculates the effective energy profile of monomers of the polymer chain.
Definition: parsing.cpp:11
Base class for all BioLCCC exceptions. Can be used by itself.
ParsingException(std::string message)
Constructs an instance ParsingException with the given message.
Definition: parsing.cpp:6
A ChemicalGroup instance contains the properties of a group of atoms.
Definition: chemicalgroup.h:20
std::vector< double > calculateSegmentEnergyProfile(const std::vector< double > &monomerEnergyProfile, const double monomerLength, const double kuhnLength)
Calculates the effective energy profile of segments of the polymer chain.
Definition: parsing.cpp:87
This exception is raised when a parsing process cannot be completed.
Definition: parsing.h:11
std::string label() const
Returns the brief code of the group used in peptide sequence notation.