Package csb :: Package bio :: Package fragments :: Module isites
[frames] | no frames]

Source Code for Module csb.bio.fragments.isites

   1  """ 
   2  I-Sites fragment library APIs.  
   3   
   4  @warning: This module is no longer developed and is provided as is.  
   5  """ 
   6   
   7  import sys 
   8  import collections 
   9  import operator 
  10  import math  
  11   
  12  import csb.io 
  13  import csb.core 
  14  import csb.bio.fragments 
  15  import csb.bio.structure as structure 
  16  import csb.bio.sequence 
  17  import csb.bio.hmm as hmm 
18 19 20 -class FragmentMatching(csb.core.enum):
21 """ 22 Enumeration of fragment assignment methods 23 """ 24 Sequence=0; Bystroff=1; CrossEntropy=2; Soeding=3; Baker=4
25
26 -class InternalISitesError(ValueError):
27 pass
28 -class InvalidParadigmError(InternalISitesError):
29 pass
30
31 -class ChainSequence(object):
32 """ 33 Represents a single PDB chain sequence entry with torsion angles 34 """ 35
36 - def __init__(self):
37 self.id = None 38 self.accession = None 39 self.chain = None 40 self.type = None 41 self.header = None 42 self.sequence = None 43 self.torsion = structure.TorsionAnglesCollection()
44 45 @property
46 - def has_torsion(self):
47 if hasattr(self, 'torsion'): 48 return len(self.torsion) > 0 49 else: 50 return False
51 52 @property
53 - def entry_id(self):
54 id = self.accession 55 if self.id is not None: 56 id += self.id 57 return id
58 59 @property
60 - def length(self):
61 if self.sequence is None: 62 return 0 63 return len(self.sequence)
64
65 -class ProteinProfile(object):
66 """ 67 Describes an I-Sites protein profile/PSSM. 68 69 @param background: background amino acid frequencies 70 @type background: dict 71 @param matrix: initialization array with target frequencies 72 @type matrix: dict 73 @param alpha: fixed "pseudocount" number for this cluster/profile 74 @type alpha: float 75 """ 76 77 BackgroundFreqs = { 78 'A': 0.077049999999999993, 'C': 0.016140000000000002, 'D': 0.058470000000000001, 79 'E': 0.065640000000000004, 'F': 0.041090000000000002, 'G': 0.072179999999999994, 80 'H': 0.023630000000000002, 'I': 0.058400000000000001, 'L': 0.089560000000000001, 81 'K': 0.061460000000000001, 'M': 0.021680000000000001, 'N': 0.045400000000000003, 82 'P': 0.045179999999999998, 'Q': 0.037080000000000002, 'R': 0.049790000000000001, 83 'S': 0.061870000000000001, 'T': 0.055660000000000001, 'V': 0.069790000000000005, 84 'W': 0.014319999999999999, 'Y': 0.035610000000000003, 85 86 'B': 0.051935000000000002, 'Z': 0.051360000000000003, 'X': 1.000000000000000000, 87 88 'O': sys.float_info.min, 'U': sys.float_info.min 89 } 90
91 - def __init__(self, background=BackgroundFreqs, matrix=None, alpha=0.2):
92 93 self._matrix = [ ] 94 self._pssm = [ ] 95 self._maxscore = 0 96 self.background = background 97 self.alpha = float(alpha) 98 99 if matrix: 100 for col in matrix: 101 self.add_column(**col)
102 103
104 - def add_column(self, **target_frequencies):
105 """ 106 Append a new column to the profile. 107 108 Keyword arguments are used to pass the individual amino acid target frequencies, 109 e.g. A=0.12, B=0.0, etc. If an amino acid is omitted, its probability will be 110 set automatically to 0. 111 112 @param target_frequencies: amino acid frequencies in that column 113 @type target_frequencies: dict 114 115 @return: the index of the new column 116 @rtype: int 117 118 @raise KeyError: if the target frequencies dict contains an unknown amino acid 119 with respect to C{profile.background} 120 """ 121 if not set(target_frequencies.keys()).issubset(self.background): 122 raise KeyError('Unrecognized residue name(s)') 123 124 profcol = { } 125 pssmcol = { } 126 127 for aa in self.background: 128 129 assert self.background[aa] > 0 130 131 if aa in target_frequencies: 132 assert target_frequencies[aa] >= 0 133 profcol[aa] = target_frequencies[aa] 134 else: 135 profcol[aa] = 0.0 136 137 Qa = self.background[aa] 138 Pja = profcol[aa] 139 alpha = self.alpha 140 141 pssmcol[aa] = math.log( (Pja + alpha * Qa) / (Qa * (1 + alpha)) ) 142 143 self._matrix.append(profcol) 144 self._pssm.append(pssmcol) 145 self._maxscore += max(pssmcol.values()) 146 147 return len(self._matrix) - 1
148 149 @property
150 - def length(self):
151 return len(self._matrix)
152 153 @property
154 - def matrix(self):
155 return self._matrix
156 157 @property
158 - def pssm(self):
159 return self._pssm
160 161 @property
162 - def max_score(self):
163 return self._maxscore
164
165 -class ISitesFragmentMatch(csb.bio.fragments.FragmentMatch):
166
167 - def __init__(self, fragment, qstart, qend, probability, rmsd, tm_score, qlength):
168 169 self._source = fragment.representative.accession + fragment.representative.chain 170 self._start = fragment.representative.structure.residues[1].rank 171 self._end = fragment.representative.structure.residues[-1].rank 172 assert (self._end - self._start + 1) == fragment.length == (qend - qstart + 1) 173 174 super(ISitesFragmentMatch, self).__init__(fragment.id, qstart, qend, probability, 175 rmsd, tm_score, qlength)
176 177 @property
178 - def source_id(self):
179 return self._source
180 181 @property
182 - def start(self):
183 return self._start
184 185 @property
186 - def end(self):
187 return self._end
188
189 -class ProfileMatch(object):
190 """ 191 Describes a profile-query match 192 193 @param id: id of the match 194 @type id: str 195 @param start: start position of the match 196 @type start: int 197 @param score: score 198 @type score: float 199 @param rmsd: RMSD between the query and the representative structure of the 200 fragment withing the matching region 201 @type rmsd: float 202 @param rmsd_dih: circular (torsion) RMSD between the query and the representative 203 structure of the fragment withing the matching region 204 @type rmsd_dih: float 205 @param tm_score: TM-Score between the query and the representative 206 structure of the fragment withing the matching region 207 @type tm_score: float 208 @param tm_scoreout: TM-Score, but calculated with local structural alignment 209 @type tm_scoreout: float 210 """ 211
212 - def __init__(self, id, start, score, rmsd=None, rmsd_dih=None, tm_score=None, tm_scoreout=None):
213 self.id = id 214 self.start = start 215 self.score = score 216 self.rmsd = rmsd 217 self.rmsd_dih = rmsd_dih 218 self.tm_score = tm_score 219 self.tm_scoreout = tm_scoreout
220
221 - def __repr__(self):
222 return "<ProfileMatch: {0.id} at {0.start}, s={0.score}, rms={0.rmsd}, tm={0.tm_score},{0.tm_scoreout} >".format(self)
223
224 - def __lt__(self, other):
225 return self.score < other.score
226
227 - def confidence(self, b0, b1, b2, b3=1.0):
228 """ 229 Convert the raw profile score to probability. 230 231 @param b0: C{cluster.linearfit[0]} 232 @type b0: float 233 @param b1: C{cluster.linearfit[1]} 234 @type b1: float 235 @param b2: C{cluster.linearfit[2]} 236 @type b2: float 237 @param b3: C{cluster.linearfit[3]} 238 @type b3: float 239 240 @return: probability for the hit being a true positive 241 @rtype: float 242 """ 243 244 if self.score is None: 245 return 0 246 return ((1 / math.pi) * math.atan(b0 + b1 * self.score) + b2) * b3
247
248 -class ProfileMatches(object):
249 """ 250 A collection of profile-database hits. 251 252 @param size: target maximum size of the hitlist 253 @type size: int 254 @param cutoff: inclusion score cutoff 255 @type cutoff: float 256 @param rmsd_cutoff: inclusion RMSD cutoff 257 @type rmsd_cutoff: float 258 259 @raise ValueError: on invalid hitlist C{size} 260 """ 261
262 - def __init__(self, size=20, cutoff=0.5, rmsd_cutoff=0.5):
263 if not size > 0: 264 raise ValueError('The maximum size of the matches table must be a positive number.') 265 266 self._matches = [ ] 267 self.list_size = int(size) 268 self.cutoff = float(cutoff) 269 self.rmsd_cutoff = float(rmsd_cutoff) 270 271 self._sorted = True
272
273 - def _sort(self):
274 if not self._sorted: 275 self._matches.sort(key=operator.attrgetter('score'), reverse=True) 276 self._sorted = True
277
278 - def add(self, id, start, score, rmsd=None):
279 """ 280 Add a new match to the collection of hits if its score satisfies the cutoff. 281 282 @param id: id of the match 283 @type id: str 284 @param start: start position of the match 285 @type start: int 286 @param score: score 287 @type score: float 288 @param rmsd: RMSD between the query and the representative structure of the 289 fragment withing the matching region 290 @type rmsd: float 291 292 @raise value_error: on invalid start position 293 """ 294 if not start > 0: 295 raise ValueError('Match start position must be greater than zero.') 296 score = float(score) 297 start = int(start) 298 299 self._sorted = False 300 301 if len(self._matches) < self.list_size: 302 if score >= self.cutoff: #or rmsd <= self.rmsd_cutoff: 303 match = ProfileMatch(id, start, score, rmsd) 304 self._matches.append(match) 305 # sort is useless if not all slots are full, leave it for the first time the data is accessed through self.hits 306 else: 307 best_match = self._matches[-1] 308 if score > best_match.score: #or rmsd < best_match.rmsd: 309 self._matches.pop() 310 match = ProfileMatch(id, start, score, rmsd) 311 self._matches.append(match) 312 self._sort() # sort is needed in real time if all hit slots are already occupied
313 314 @property
315 - def length(self):
316 return len(self._matches)
317 318 @property
319 - def hits(self):
320 if self.length > 0: 321 self._sort() 322 return self._matches 323 else: 324 return None
325 326 @property
327 - def best_match(self):
328 if self.length > 0: 329 return self.hits[0] 330 else: 331 return None
332
333 -class Hitlist(object):
334 """ 335 I-Site instance hitlist. 336 337 @param query: the query fragment 338 @type query: L{Cluster} 339 @param hits: a list of L{ProfileMatch}es 340 @type hits: list 341 """ 342
343 - def __init__(self, query, hits):
344 345 self.query = query 346 self.hits = hits
347
348 - def __getitem__(self, i):
349 return self.hits[i]
350
351 - def __iter__(self):
352 return iter(self.hits)
353
354 - def best_hit(self):
355 """ 356 @return: the best hit in the list in terms of profile score 357 @rtype: L{ProfileMatch} 358 """ 359 best = None 360 for hit in self.hits: 361 if best is None: 362 best = hit 363 else: 364 if best.score < hit.score or (best.score == hit.score and best.rmsd > hit.rmsd): 365 best = hit 366 return best
367
368 - def top(self, n):
369 """ 370 @param n: number of top hits 371 @type n: int 372 373 @return: top C{n} hits in terms of profile score 374 @rtype: list of L{ProfileMatch} 375 """ 376 377 hits = list(self.hits) 378 hits.sort() 379 return hits[-n:]
380
381 -class Library(object):
382 """ 383 Representation of an I-Sites peptide fragment library. 384 Provides dictionary-like access to the underlying L{Cluster} objects by 385 either C{cluster.id} or C{cluster.representative.accesion}. 386 """ 387
388 - def __init__(self):
389 390 self.name = 'ISITES' 391 self.version = None 392 self.centroids = None 393 self.documentation = None 394 395 self._clusters = None 396 self._ondemand = False 397 self._byid = None 398 self._byrep = None 399 400 self.__iscfiles = []
401
402 - def __getitem__(self, item):
403 if self._ondemand: 404 raise AttributeError("ID/rep-based access to clusters is not available in Delay-parsed libraries.") 405 406 if isinstance(item, csb.core.string): 407 return self._byrep[item] 408 else: 409 return self._byid[item]
410
411 - def __contains__(self, key):
412 try: 413 self[key] 414 return True 415 except KeyError: 416 return False
417 418 @property
419 - def clusters(self):
420 return self._clusters
421 422 @clusters.setter
423 - def clusters(self, clusters):
424 if clusters is not None: 425 if not isinstance(clusters, collections.Iterable): 426 clusters = [clusters] 427 # if not isinstance(clusters, collections.Iterator): 428 # for c in clusters: 429 # assert isinstance(c, Cluster), "The I-Sites Library is constructed of Cluster objects." 430 431 self._clusters = clusters 432 if isinstance(clusters, collections.Iterator): 433 self._ondemand = True 434 else: 435 self._ondemand = False 436 self._byid = { } 437 self._byrep = { } 438 439 for c in clusters: 440 self._byid[c.id] = c 441 442 accession = c.representative.accession[:4] 443 444 if accession not in self._byrep: 445 self._byrep[accession] = [ ] 446 self._byrep[accession].append(c)
447
448 - def compact(self):
449 """ 450 Compact the library by excluding unimportant information: 451 452 - documentation 453 - clusters: file, program, covariance 454 455 @raise AttributeError: on attempt to compact a delay-parsed library 456 """ 457 if self._ondemand: 458 raise AttributeError("Delay-parsed libraries cannot be compacted.") 459 460 self.documentation = None 461 for c in self.clusters: 462 c.file = None 463 c.program = None 464 c.covariance = None
465
466 - def serialize(self, file):
467 """ 468 Serialize the library to a file. 469 470 @param file: output file name 471 @type file: str 472 473 @raise AttributeError: on attempt to serialize a delay-parsed library 474 """ 475 if self._ondemand: 476 raise AttributeError("Delay-parsed libraries cannot be serialized.") 477 478 with open(file, 'wb') as output: 479 csb.io.Pickle.dump(self, output, protocol=csb.io.Pickle.HIGHEST_PROTOCOL)
480 481 @staticmethod
482 - def deserialize(file):
483 """ 484 Load I-sites from a serialization file. 485 486 @param file: source file name 487 @type file: str 488 489 @return: the deserialized library object 490 @rtype: L{Library} 491 """ 492 493 with open(file, 'rb') as output: 494 library = csb.io.Pickle.load(output) 495 496 return library
497
498 -class Cluster(object):
499 """ 500 Object representation of an I-Sites library peptide fragment. 501 """ 502
503 - def __init__(self):
504 505 self.id = None 506 self.motiflen = None 507 self.profilelen = None 508 self.overhang = None 509 self.file = None 510 self.program = None 511 self.mda = None 512 self.dme = None 513 self.pseudocount = 0.2 514 self.linearfit = None 515 self.representative = None 516 self.covarweight = None 517 self.profile = ProteinProfile(ProteinProfile.BackgroundFreqs, alpha=self.pseudocount) 518 self.covariance = None
519
520 - def __lt__(self, other):
521 return self.id < other.id
522 523 @property
524 - def isite(self):
525 return self.id
526 527 @property
528 - def length(self):
529 return self.profile.length
530 531 @property
532 - def has_structure(self):
533 if self.representative.structure: 534 return True 535 return False
536
537 - def serialize(self, file):
538 """ 539 Serialize the cluster to a file. 540 541 @param file: output file name 542 @type file: str 543 """ 544 545 with open(file, 'wb') as output: 546 csb.io.Pickle.dump(self, output, protocol=csb.io.Pickle.HIGHEST_PROTOCOL)
547 548 @staticmethod
549 - def deserialize(file):
550 """ 551 Load a cluster from a serialized object. 552 553 @param file: source file name 554 @type file: str 555 556 @return: deserialized fragment object 557 @rtype: L{Cluster} 558 """ 559 560 with open(file, 'rb') as output: 561 cluster = csb.io.Pickle.load(output) 562 563 return cluster
564
565 - def attach_structure(self, pdb_file):
566 """ 567 Parse the paradigm's pdb_file and attach structural data to 568 C{cluster.representative} (C{source} and C{structure} attributes). 569 570 @param pdb_file: source PDB file containing the structure of the paradigm 571 @type pdb_file: str 572 573 @raise InternalISitesError: on parse and/or structure assignment errors 574 @raise InvalidParadigmError: when the torsion angles in the cluster do not 575 match the angles computed from the PDB file 576 """ 577 from csb.bio.io import StructureParser 578 rep = self.representative 579 s = StructureParser(pdb_file).parse_structure() 580 581 if not rep.chain: 582 if s.chains.length > 1: 583 raise InternalISitesError('Cluster {0} does not define any chain, but multiple choices exist.'.format(self.id)) 584 else: 585 rep.chain = s.chains.keys()[0] 586 if rep.chain not in s.chains: 587 raise InternalISitesError('Structure {0} does not contain chain {1}'.format(rep.accession, rep.chain)) 588 589 s.chains[rep.chain].compute_torsion() 590 start_residue = s.chains[rep.chain].find(rep.start) 591 592 current_rank = start_residue.rank 593 for i, rep_angles in enumerate(rep.angles, start=rep.angles.start_index): 594 595 current_residue = s.chains[rep.chain].residues[current_rank] 596 torsion_delta = 0 597 598 if current_residue.torsion.phi: 599 torsion_delta += abs(current_residue.torsion.phi - rep_angles.phi) 600 if current_residue.torsion.psi: 601 torsion_delta += abs(current_residue.torsion.psi - rep_angles.psi) 602 603 assert (i + rep.start - 1) == int(current_residue.id) 604 605 if torsion_delta > 1: 606 raise InvalidParadigmError('delta_phi + delta_psi for ' 607 '{3.accession} {3.chain}[{4}] is {0}: phi: {1.phi} vs {2.phi}, psi: {1.psi} vs {2.psi}'.format( 608 torsion_delta, current_residue.torsion, rep_angles, rep, current_rank)) 609 current_rank += 1 610 611 start = start_residue.rank - self.overhang 612 stop = start + self.profile.length - 1 613 614 if start < s.chains[rep.chain].residues.start_index or stop > s.chains[rep.chain].residues.last_index: 615 raise InternalISitesError('With the ovehangs included, fragment ' 616 '{0} exceeds the boundaries of the chain: {1}..{2} vs {3.start_index}..{3.last_index}'.format( 617 self.id, start, stop, s.chains[rep.chain].residues)) 618 619 rep.structure = s.chains[rep.chain].subregion(start, stop) 620 621 assert rep.structure.length == self.profile.length 622 623 backbone = set(['N', 'CA', 'C']) 624 for residue in rep.structure.residues: 625 if not (residue.has_structure and set(residue.atoms).issuperset(backbone)): 626 rep.structure = None 627 raise InternalISitesError('Fragment {0} is built on discontinuous structure: {1} {4} at {2}-{3}'.format( 628 self.id, rep.accession, start, stop, residue)) 629 630 rep.source = ChainSequence() 631 rep.source.sequence = s.chains[rep.chain].sequence 632 rep.source.accession = s.accession 633 rep.source.id = rep.chain 634 rep.source.type = s.chains[rep.chain].type 635 rep.source.torsion = s.chains[rep.chain].torsion
636
637 - def sequence_similarity(self, sequence, start=0):
638 """ 639 Compute the log-odds score of C{sequence} matching C{self}'s profile. 640 641 @param sequence: subject sequence 642 @type sequence: str 643 @param start: do the comparison starting with this offset in the sequence 644 @type start: int 645 646 @return: log-odds score 647 @rtype: float 648 """ 649 score = 0 650 window = self.profile.length 651 652 sequence = sequence[start : start + window] 653 assert len(sequence) == self.profile.length 654 655 for column, aa in enumerate(sequence): 656 aa = aa.upper() 657 if aa in self.profile.pssm[column]: 658 score += self.profile.pssm[column][aa] 659 else: 660 raise ValueError('Unknown residue {0}'.format(aa)) 661 662 return score
663
664 - def profile_similarity(self, profile, start=0):
665 """ 666 Compute the similarity score of C{profile} matching C{self}'s profile (see 667 Bystroff 2001, 2008). 668 669 @param profile: subject profile, built with alpha=0.5 670 @type profile: L{ProteinProfile} 671 @param start: do the comparison starting with this offset in the subject 672 @type start: int 673 674 @return: similarity score calculated with Bystroff's metric 675 @rtype: float 676 677 @raise ValueError: on mismatch in the amino acid content at a given column 678 in both profiles; or when the subject has not been created 679 with alpha=0.5 680 """ 681 if profile.alpha != 0.5: 682 raise ValueError('The target profile must be built with alpha=0.5') 683 684 score = 0 685 window = self.profile.length 686 687 for column in range(0, window): 688 689 if set(self.profile.pssm[column]) != set(profile.pssm[column + start]): 690 raise ValueError('Both profiles must contain identical amino acids at a given column') 691 692 for aa in self.profile.pssm[column]: 693 score += self.profile.pssm[column][aa] * profile.pssm[column + start][aa] 694 695 return score
696
697 - def cross_entropy(self, profile, start=0):
698 """ 699 Compute the similarity score of C{profile} matching C{self}'s profile using 700 the cross entropy method. 701 702 @param profile: subject profile 703 @type profile: L{ProteinProfile} 704 @param start: do the comparison starting with this offset in the subject 705 @type start: int 706 707 @return: the cross entropy between the profiles 708 @rtype: float 709 """ 710 score = 0 711 window = self.profile.length 712 713 for column in range(0, window): 714 for aa in self.profile.pssm[column]: 715 score += self.profile.pssm[column][aa] * math.exp( profile.pssm[column + start][aa] ) 716 717 return score
718
719 - def sum_of_odds(self, profile, start=0):
720 """ 721 Compute the similarity score of C{profile} matching C{self}'s profile using 722 the log-sum-of-odds method (Soeding). 723 724 @param profile: subject profile 725 @type profile: L{ProteinProfile} 726 @param start: do the comparison starting with this offset in the subject 727 @type start: int 728 729 @return: the log-sum-of-odds similarity between the profiles 730 @rtype: float 731 """ 732 score = 1 733 window = self.profile.length 734 735 for column in range(0, window): 736 737 dot_product = 0 738 739 for aa in self.profile.pssm[column]: 740 q_emission = self.profile.matrix[column][aa] 741 s_emission = profile.matrix[column + start][aa] 742 dot_product += ((q_emission * s_emission) / self.profile.background[aa]) 743 744 score *= dot_product 745 746 return math.log(score or sys.float_info.min)
747
748 - def city_block(self, profile, start=0):
749 """ 750 Compute the similarity score of C{profile} matching C{self}'s profile using 751 the exponential city block metric (see Baker 1995). 752 753 @param profile: subject profile 754 @type profile: L{ProteinProfile} 755 @param start: do the comparison starting with this offset in the subject 756 @type start: int 757 758 @return: the city block similarity between the profiles 759 @rtype: float 760 """ 761 762 score = 0 763 window = self.profile.length 764 765 for column in range(0, window): 766 for aa in self.profile.pssm[column]: 767 diff = math.exp( -self.profile.matrix[column][aa] ) - math.exp( -profile.matrix[column + start][aa] ) 768 score += abs(diff) 769 770 return score
771
773 """ 774 Return a callable fragment comparer that implements the specified 775 comparison mode. 776 777 @param mode: specifies the desired comparison method to be implemented 778 by the callable - a member of the L{FragmentMatching} enum 779 @type mode: L{csb.core.EnumItem} 780 781 @return: the proper comparison function which implements fragment comparison 782 with the C{mode} method 783 @rtype: function 784 785 @raise ValueError: on invalid C{mode} specified 786 """ 787 if mode == FragmentMatching.Bystroff: 788 return self.profile_similarity 789 elif mode == FragmentMatching.CrossEntropy: 790 return self.cross_entropy 791 elif mode == FragmentMatching.Soeding: 792 return self.sum_of_odds 793 elif mode == FragmentMatching.Sequence: 794 return self.sequence_similarity 795 elif mode == FragmentMatching.Baker: 796 return self.city_block 797 else: 798 raise ValueError(mode)
799
800 - def slide_over(self, target, mode=FragmentMatching.Bystroff):
801 """ 802 Find profile and structural matches by sliding the fragment over a specified 803 C{target} chain. 804 805 @param target: subject chain to be scanned to fragment instances 806 @type target: L{structure.Chain} 807 @param mode: fragment instance searching mode - a L{FragmentMatching} member 808 @type mode: L{csb.core.EnumItem} 809 810 @return: a list of L{ProfileMatch}es 811 @rtype: list 812 """ 813 814 compare = self.fragment_comparer(mode) 815 816 if hasattr(target, 'entry_id'): 817 entry_id = target.entry_id 818 else: 819 entry_id = target.id 820 821 window = self.profile.length 822 rmsd_window = window - 2 * self.overhang 823 matches = [ ] 824 query_residues = structure.Chain(self.representative.chain, 825 residues=self.representative.structure.residues[self.overhang + 1: -self.overhang]) 826 target_sequence = target.sequence # caching these values for efficiency 827 828 for i in range(0, len(target_sequence) - window + 1): 829 830 start = i + 1 831 rmsd_start = start + self.overhang 832 score, tm, tm_out, rmsd_ca, rmsd_dih = 0, None, None, None, None 833 834 if mode == FragmentMatching.Sequence: 835 score = compare(target_sequence, start=i) 836 else: 837 score = compare(target.profile, start=i) 838 839 subject_residues = structure.Chain(target.id, residues=target.residues[rmsd_start : rmsd_start + rmsd_window]) 840 841 try: 842 rmsd_ca = query_residues.rmsd(subject_residues) 843 except structure.Broken3DStructureError: 844 rmsd_ca = None 845 pass 846 847 if score is not None and (rmsd_ca is not None or rmsd_dih is not None): 848 match = ProfileMatch(entry_id, start, score, rmsd_ca, rmsd_dih, tm, tm_out) 849 matches.append(match) 850 851 return matches
852
853 - def slide_over_fast(self, target, mode=FragmentMatching.Bystroff):
854 """ 855 Find profile matches only (do not compute RMSD) by sliding the fragment 856 over a specified C{target} chain. 857 858 @param target: subject chain to be scanned to fragment instances 859 @type target: L{structure.Chain} 860 @param mode: fragment instance searching mode - a L{FragmentMatching} member 861 @type mode: L{csb.core.EnumItem} 862 863 @return: a list of L{ProfileMatch}es 864 @rtype: list 865 """ 866 if hasattr(target, 'entry_id'): 867 entry_id = target.entry_id 868 else: 869 entry_id = target.id 870 871 compare = self.fragment_comparer(mode) 872 873 window = self.profile.length 874 matches = [ ] 875 876 for i in range(0, target.profile.length - window + 1): 877 878 start = i + 1 879 score = None 880 881 if mode == FragmentMatching.Sequence: 882 score = compare(target.sequence, start=i) 883 else: 884 score = compare(target.profile, start=i) 885 886 if score is not None: 887 match = ProfileMatch(entry_id, start, score) 888 matches.append(match) 889 890 return matches
891
892 - def to_hmm(self):
893 """ 894 Convert this fragment into a profile HMM segment. Each column in the 895 fragment's profile becomes a Match state with equivalent emissions. 896 However, all Match states are connected with a fixed transition 897 probability of 100%. 898 899 @return: an HMM wrapper around the cluster's profile 900 @rtype: L{hmm.ProfileHMM} 901 """ 902 903 factory = hmm.StateFactory() 904 905 p = hmm.ProfileHMM(hmm.ScoreUnits.Probability) 906 p.family = 'is' + str(self.id) 907 p.name = 'is' + str(self.id) 908 p.id = str(self.id) 909 p.pseudocounts = False 910 p.version = '1.6' 911 912 background = dict([ (aa, self.profile.background[aa]) 913 for aa in 'ACDEFGHIKLMNPQRSTVWY' ]) 914 915 for i, row in enumerate(self.profile.matrix, start=1): 916 917 row = dict([ (aa, row[aa]) for aa in 'ACDEFGHIKLMNPQRSTVWY' ]) 918 residue = self.representative.structure.residues[i] 919 920 match = factory.create_match(row, background) 921 match.rank = i 922 923 layer = hmm.HMMLayer(i, residue) 924 layer.append(match) 925 926 p.layers.append(layer) 927 928 for layer in p.layers: 929 930 match = layer[hmm.States.Match] 931 932 if layer.rank < p.layers.last_index: 933 next_match = p.layers[layer.rank + 1][hmm.States.Match] 934 tran = hmm.Transition(match, next_match, 1.0) 935 match.transitions.append(tran) 936 937 last_tran = hmm.Transition(p.layers[-1][hmm.States.Match], p.end, 1.0) 938 p.layers[-1][hmm.States.Match].transitions.append(last_tran) 939 940 start_tran = hmm.Transition(p.start, p.layers[1][hmm.States.Match], 1.0) 941 p.start.transitions.append(start_tran) 942 943 p.length.matches = p.layers.length 944 p.length.layers = p.layers.length 945 p.effective_matches = 10 946 947 rep_sequence = ''.join([ str(l.residue.type) for l in p.layers ]) 948 seq = csb.bio.sequence.Sequence('dummy', '>dummy', rep_sequence, 949 type=csb.bio.sequence.SequenceTypes.Protein) 950 p.alignment = csb.bio.sequence.A3MAlignment([seq]) 951 952 return p
953 954 @staticmethod
955 - def from_hmm(source, id, start=None, end=None, rep_accession=None, 956 rep_chain=None, alpha=0.2):
957 """ 958 Build a fragment from a Profile HMM. 959 960 @param source: the HMM to convert 961 @type source: L{hmm.ProfileHMM}, L{hmm.ProfileHMMSegment} 962 @param id: ID of the new fragment 963 @type id: int 964 @param start: start position (optional) 965 @type start: int 966 @param end: end position (optional) 967 @type end: int 968 @param rep_accession: accession number for I{cluster.representative} 969 @type rep_accession: str 970 @param rep_chain: chain ID for I{cluster.representative} 971 @type rep_chain: str 972 @param alpha: alpha pseudocount for I{cluster.profile.pssm} 973 @type alpha: float 974 975 @return: an I-Sites cluster wrapper 976 @rtype: L{Cluster} 977 978 @raise TypeError: when the HMM's structural data is missing or 979 interrupted 980 """ 981 982 if not source.has_structure: 983 raise TypeError('This HMM has no structural data assigned and ' 984 'cannot be converted.') 985 986 if start is None: 987 start = source.layers.start_index 988 if end is None: 989 end = source.layers.last_index 990 991 score_units = source.score_units 992 source.convert_scores(hmm.ScoreUnits.Probability) 993 994 background = dict(ProteinProfile.BackgroundFreqs) 995 for aa in source.layers[1][hmm.States.Match].emission: 996 background[str(aa)] = source.layers[1][hmm.States.Match].background[aa] 997 998 cluster = Cluster() 999 cluster.id = id 1000 cluster.overhang = 0 1001 cluster.profile = ProteinProfile(background, alpha=alpha) 1002 1003 residues = [] 1004 for layer in source.layers: 1005 1006 residues.append(csb.core.deepcopy(layer.residue)) 1007 1008 if start <= layer.rank <= end: 1009 if not layer.residue.has_structure: 1010 raise TypeError( 1011 "The HMM's structure is interrupted at layer {0}.".format( 1012 layer.rank)) 1013 1014 emission = {} 1015 1016 for aa in layer[hmm.States.Match].emission: 1017 emission[str(aa)] = layer[hmm.States.Match].emission[aa] 1018 1019 cluster.profile.add_column(**dict(emission)) 1020 1021 cluster.profilelen = cluster.profile.length 1022 cluster.motiflen = cluster.profile.length - 2 * cluster.overhang 1023 1024 if rep_accession is None: 1025 rep_accession = source.id[1:5].lower() 1026 if rep_chain is None: 1027 rep_chain = source.id[5].upper() 1028 1029 chain = structure.Chain(rep_chain, structure.SequenceTypes.Protein, 1030 None, residues, rep_accession) 1031 chain.compute_torsion() 1032 1033 src = ChainSequence() 1034 src.sequence = chain.sequence 1035 src.accession = chain.accession 1036 src.id = chain.id 1037 src.type = chain.type 1038 src.torsion = chain.torsion 1039 1040 cluster.representative = RepStructureFragment(chain.accession, 1041 chain.id, start) 1042 cluster.representative.source = src 1043 cluster.representative.structure = chain.subregion(start, end) 1044 cluster.representative.angles = cluster.representative.structure.torsion 1045 1046 assert cluster.representative.angles.length == cluster.motiflen 1047 assert cluster.profile.length == cluster.representative.structure.length 1048 assert cluster.representative.angles.length == (cluster.profile.length - 1049 2 * cluster.overhang) 1050 1051 source.convert_scores(score_units) 1052 1053 return cluster
1054
1055 -class RepStructureFragment(object):
1056 """ 1057 Container class which describes an I-Sites paradigm structure for a cluster. 1058 1059 @param accession: paradigm's accession number 1060 @type accession: str 1061 @param chain: chain ID 1062 @type chain: str 1063 @param start: the start position in the paradigm's chain where the fragment 1064 instance is located (including the overhangs). In the original 1065 library this is a PDB sequence number, not a SEQRES rank. The 1066 residue rank can be retrieved from the structure instead: 1067 1068 >>> cluster.representative.structure.find(start).rank 1069 residue rank 1070 1071 @type start: int 1072 @param angles: torsion angles of the paradigm, in degrees 1073 @type angles: L{structure.TorsionAnglesCollection} 1074 @param source: the entire paradigm chain with pre-computed torsion angles 1075 @type source: L{ChainSequence} 1076 @param struct: a segment of the paradigm's L{structure.Chain} which 1077 contains the I-Site. Residue ranks and IDs are preserved 1078 as they were in the original chain (no shifting) 1079 @type struct: L{structure.Chain} 1080 """ 1081
1082 - def __init__(self, accession, chain, start, angles=None, source=None, struct=None):
1083 1084 self.accession = accession 1085 self.chain = chain 1086 self.start = start 1087 self._source = None 1088 self._angles = structure.TorsionAnglesCollection() 1089 self._structure = None 1090 1091 1092 if angles is not None: 1093 self.angles = angles 1094 if source is not None: 1095 self.source = source 1096 if struct is not None: 1097 self.structure = struct
1098 1099 @property
1100 - def angles(self):
1101 return self._angles
1102 @angles.setter
1103 - def angles(self, angles):
1104 self._angles.update(angles)
1105 1106 @property
1107 - def source(self):
1108 return self._source
1109 @source.setter
1110 - def source(self, chain):
1111 if type(chain) not in (structure.Chain, ChainSequence): 1112 raise TypeError("The source property must be a Chain or ChainSequence instance with torsion property pre-calculated.") 1113 self._source = chain
1114 1115 @property
1116 - def structure(self):
1117 return self._structure
1118 @structure.setter
1119 - def structure(self, chain_fragment):
1120 if not isinstance(chain_fragment, structure.Chain): 1121 raise TypeError("The structure property must be a Chain instance.") 1122 self._structure = chain_fragment
1123