
Source Code for Package csb.bio.fragments

   1  """ 
   2  APIs for working with protein structure fragments and libraries. 
   3   
   4  This package contains the nuts and bolts of HHfrag. Everything here revolves 
   5  around the L{Target} class, which describes a protein structure prediction 
   6  target. One typically assigns fragments (L{Assignment}s) to the target and then 
   7  builds a fragment library with L{RosettaFragsetFactory}. 
   8   
   9  @note: Internal or legacy objects are intentionally left undocumented. 
  10         This typically indicates experimental code. 
  11  """ 
  12   
  13  import os 
  14  import numpy 
  15   
  16  import csb.io 
  17  import csb.core 
  18  import csb.bio.utils 
  19  import csb.bio.structure 
  20  import csb.bio.sequence 
  21   
  22  from csb.bio.structure import SecondaryStructure 
  23 
  24 
  25 class FragmentTypes(object):
  26 
  27     ISites = 'IS'
  28     HMMFragments = 'HH'
  29     HHThread = 'TH'
  30     HHfrag = HHThread
  31     Rosetta = 'NN'
  32 
  33 class Metrics(object):
  34 
  35     RMSD = 'rmsd_to'
  36     NORMALIZED_RMSD = 'nrmsd_to'
  37     MDA = 'mda_to'
  38 
  39 RANDOM_RMSD = { 5: 1.8749005857255376, 6: 2.4314283686276261, 7: 2.9021135267789608, 8: 3.2477716200172715, 9: 3.5469606556031708, 10: 3.8295465524456329,
  40                11: 4.1343107114131783, 12: 4.3761697929053014, 13: 4.6707299668248394, 14: 4.9379016881069733, 15: 5.1809028645084911, 16: 5.4146957142595662,
  41                17: 5.7135948448156988, 18: 5.9597935432566782, 19: 6.1337340535741962, 20: 6.3962825155503271, 21: 6.6107937773415166, 22: 6.8099096274123401,
  42                23: 7.0435583846849639, 24: 7.2160956482560970, 25: 7.4547896324594962, 26: 7.6431870072434211, 27: 7.8727812194173836, 28: 8.0727393298443637,
  43                29: 8.2551450998965326, 30: 8.4413583511786587, 31: 8.5958719774122052, 32: 8.7730435506242408, 33: 8.9970648837941649, 34: 9.1566521405105163,
  44                35: 9.2828620878454728, 36: 9.4525824357923405, 37: 9.6322126445253300, 38: 9.7851684750961176, 39: 9.9891454649821476, 40: 10.124373939352028,
  45                41: 10.284348528344765, 42: 10.390457305096271, 43: 10.565792044674239, 44: 10.676532740033737, 45: 10.789537132283652, 46: 11.004475543757550,
  46                47: 11.064541647783571, 48: 11.231219875286985, 49: 11.319222637391441, 50: 11.485478165340824, 51: 11.607522494435521, 52: 11.700268836069840,
  47                53: 11.831245255954073, 54: 11.918975893263905 }
48 49 -class FragmentMatch(object):
50 """ 51 Base class, representing a match between a fragment and its target. 52 """ 53
54 - def __init__(self, id, qstart, qend, probability, rmsd, tm_score, qlength):
55 56 self._id = id 57 self._qstart = qstart 58 self._qend = qend 59 self._probability = probability 60 self._rmsd = rmsd 61 self._tm_score = tm_score 62 self._qlength = qlength
63 64 @property
65 - def id(self):
66 return self._id
67 68 @property
69 - def qstart(self):
70 return self._qstart
71 72 @property
73 - def qend(self):
74 return self._qend
75 76 @property
77 - def qlength(self):
78 return self._qlength
79 80 @property
81 - def rmsd(self):
82 return self._rmsd
83 84 @property
85 - def tm_score(self):
86 return self._tm_score
87 88 @property
89 - def probability(self):
90 return self._probability
91 92 @property
93 - def length(self):
94 return self.qend - self.qstart + 1
95 96 @property
97 - def source_id(self):
98 raise NotImplementedError()
99 100 @property
101 - def start(self):
102 raise NotImplementedError()
103 104 @property
105 - def end(self):
106 raise NotImplementedError()
107
108 -class PredictionContainer(object):
109
110 - def __init__(self, target, isites_prediction, hmm_prediction, combined_prediction):
111 112 self.target = target 113 114 self.isites = isites_prediction 115 self.hmm = hmm_prediction 116 self.combined = combined_prediction
117
118 -class Prediction(object):
119
120 - def __init__(self, alignment, coordinates):
121 122 self.alignment = alignment 123 self.coordinates = coordinates
124
125 -class TorsionAnglesPredictor(object):
126 """ 127 Fragment-based phi/psi angles predictor. 128 129 @param target: target protein, containing fragment assignments 130 @type target: L{Target} 131 @param threshold: RMSD distance threshold for L{FragmentCluster}-based filtering 132 @type threshold: float 133 @param extend: pick alternative, longer cluster reps, if possible 134 @type extend: bool 135 @param init: populate all L{FragmentCluster}s on instantiation. If False, this step 136 will be performed on demand (the first time C{predictor.compute()} is invoked) 137 138 @note: if C{init} is False, the first call to C{predictor.compute()} might take a long 139 time. Subsequent calls will be very fast. 140 """ 141
142 - def __init__(self, target, threshold=1.5, extend=False, init=False):
143 144 if not isinstance(target, Target): 145 raise TypeError(target) 146 if target.matches.length == 0: 147 raise ValueError('This target has no fragment assignments') 148 149 self._target = target 150 self._threshold = float(threshold) 151 self._extend = bool(extend) 152 153 self._initialized = False 154 self._reps = {} 155 self._clusters = {} 156 157 if init: 158 self.init()
159 160 @property
161 - def target(self):
162 return self._target
163 164 @property
165 - def threshold(self):
166 return self._threshold
167 168 @property
169 - def extend(self):
170 return self._extend
171
172 - def init(self):
173 """ 174 Compute and cache all L{FragmentCluster}s. 175 """ 176 177 self._reps = {} 178 self._clusters = {} 179 180 for residue in self.target.residues: 181 cluster = self._filter(residue) 182 183 if cluster is not None: 184 rep = cluster.centroid() 185 if rep.has_alternative: 186 rep.exchange() 187 188 self._reps[residue.native.rank] = rep 189 self._clusters[residue.native.rank] = cluster.items 190 191 self._initialized = True
192
193 - def _filter(self, residue):
194 195 try: 196 nodes = [] 197 for ai in residue.assignments: 198 node = ClusterNode.create(ai.fragment) 199 nodes.append(node) 200 201 cluster = FragmentCluster(nodes, threshold=self.threshold) 202 cluster.shrink(minitems=0) 203 204 return cluster 205 206 except (ClusterExhaustedError, ClusterDivergingError): 207 return None
208
209 - def _residue(self, rank):
210 211 for r in self._target.residues: 212 if r.native.rank == rank: 213 return r 214 215 raise ValueError('Rank {0} is out of range'.format(rank))
216
217 - def compute_single(self, rank):
218 """ 219 Extract torsion angles from the L{ClusterRep} at residue C{#rank}. 220 221 @param rank: target residue rank 222 @type rank: int 223 224 @rtype: L{TorsionPredictionInfo} 225 """ 226 227 residue = self._residue(rank) 228 rep = residue.filter(threshold=self.threshold, extend=self.extend) 229 230 if rep is None: 231 return None 232 233 else: 234 fragment = rep.centroid 235 torsion = fragment.torsion_at(rank, rank)[0] 236 ss = fragment.sec_structure_at(rank, rank)[0] 237 238 return TorsionPredictionInfo(rank, rep.confidence, torsion, ss, primary=True)
239
240 - def compute(self, rank):
241 """ 242 Extract torsion angles from all L{ClusterRep}s, covering residue C{#rank}. 243 244 @param rank: target residue rank 245 @type rank: int 246 247 @return: L{TorsionPredictionInfo} instances, sorted by confidence 248 @rtype: tuple of L{TorsionPredictionInfo} 249 """ 250 251 if not self._initialized: 252 self.init() 253 254 prediction = [] 255 256 for rep in self._reps.values(): 257 258 if rep.centroid.qstart <= rank <= rep.centroid.qend: 259 260 fragment = rep.centroid 261 torsion = fragment.torsion_at(rank, rank)[0] 262 ss = fragment.sec_structure_at(rank, rank)[0] 263 info = TorsionPredictionInfo(rank, rep.confidence, torsion, ss) 264 265 if rep is self._reps.get(rank, None): 266 info.primary = True 267 268 prediction.append(info) 269 270 prediction.sort(reverse=True) 271 return tuple(prediction)
272
273 - def flat_torsion_map(self):
274 """ 275 Filter the current fragment map and create a new, completely flat, 276 non-overlapping map built from centroids, assigned iteratively by 277 decreasing confidence. Centroids with lower confidence which overlap 278 with previously assigned centroids will be trimmed to fill existing 279 gaps only. 280 281 @return: L{TorsionPredictionInfo} instances, one for each target residue 282 @rtype: tuple of L{TorsionPredictionInfo} 283 """ 284 285 if not self._initialized: 286 self.init() 287 288 prediction = [] 289 slots = set(range(1, self.target.length + 1)) 290 291 reps = list(self._reps.values()) 292 reps.sort(key=lambda i: i.confidence, reverse=True) 293 294 for rep in reps: 295 296 for rank in range(rep.centroid.qstart, rep.centroid.qend + 1): 297 if rank in slots: 298 torsion = rep.centroid.torsion_at(rank, rank)[0] 299 ss = rep.centroid.sec_structure_at(rank, rank)[0] 300 info = TorsionPredictionInfo(rank, rep.confidence, torsion, ss, primary=True) 301 302 prediction.append(info) 303 slots.remove(rank) 304 305 for rank in slots: 306 prediction.append(TorsionPredictionInfo(rank, 0, None)) 307 308 prediction.sort(key=lambda i: i.rank) 309 return tuple(prediction)
310
311 - def get_angles(self, rank):
312 """ 313 Extract all torsion angles coming from all fragments, which had survived 314 the filtering and cover residue C{#rank}. 315 316 @param rank: target residue rank 317 @type rank: int 318 319 @return: all L{TorsionAngles} for a cluster at the specified residue 320 @rtype: tuple of L{TorsionAngles} 321 """ 322 323 if not self._initialized: 324 self.init() 325 if rank not in self._clusters: 326 return tuple() 327 328 angles = [] 329 330 for node in self._clusters[rank]: 331 fragment = node.fragment 332 torsion = fragment.torsion_at(rank, rank)[0] 333 angles.append(torsion) 334 335 return tuple(angles)
336
337 338 -class TorsionPredictionInfo(object):
339 """ 340 Struct container for a single torsion angle prediction. 341 342 @param rank: target residue rank 343 @type rank: int 344 @param confidence: confidence of prediction 345 @type confidence: float 346 @param torsion: assigned phi/psi/omega angles 347 @type torsion: L{TorsionAngles} 348 @param dssp: assigned secondary structure 349 @type dssp: L{SecondaryStructureElement} 350 @param primary: if True, designates that the assigned angles are extracted 351 from the L{ClusterRep} at residue C{#rank}; otherwise: the 352 angles are coming from another, overlapping L{ClusterRep} 353 354 """ 355
 356 - def __init__(self, rank, confidence, torsion, dssp=None, primary=False):
357 358 self.rank = rank 359 self.confidence = confidence 360 self.torsion = torsion 361 self.primary = primary 362 self.dssp = dssp
363
364 - def as_tuple(self):
365 """ 366 @return: convert this prediction to a tuple: (confidence, phi, psi, omega) 367 @rtype: tuple 368 """ 369 return tuple([self.confidence, self.torsion.phi, self.torsion.psi, self.torsion.omega])
370
371 - def __str__(self):
372 return '<TorsionPredictionInfo: {0.confidence:6.3f} at #{0.rank}>'.format(self)
373
374 - def __lt__(self, other):
375 return self.confidence < other.confidence
376
377 378 -class AssignmentFactory(object):
379
380 - def target(self, *a, **k):
381 return Target(*a, **k)
382
383 - def residue(self, *a, **k):
384 return TargetResidue(*a, **k)
385
386 - def assignment(self, *a, **k):
387 return Assignment(*a, **k)
388
389 -class ChemShiftAssignmentFactory(object):
390
391 - def target(self, *a, **k):
392 return ChemShiftTarget(*a, **k)
393
394 - def residue(self, *a, **k):
395 return ChemShiftTargetResidue(*a, **k)
396
397 - def assignment(self, *a, **k):
398 return ChemShiftAssignment(*a, **k)
399
400 -class Target(csb.core.AbstractNIContainer):
401 """ 402 Represents a protein structure prediction target. 403 404 @param id: target sequence ID, in PDB accnC format 405 @type id: str 406 @param length: total target sequence length 407 @type length: int 408 @param residues: a list, containing target's residues. See also 409 L{Target.from_sequence} 410 @type residues: iterable of L{csb.bio.structure.ProteinResidue}s 411 """ 412
413 - def __init__(self, id, length, residues, overlap=None, segments=None, factory=AssignmentFactory()):
414 415 self._id = id 416 self._accession = id[:-1] 417 self._chain_id = id[-1] 418 self._length = length 419 self._overlap = overlap 420 self._factory = factory 421 422 self._assignments = csb.core.ReadOnlyCollectionContainer(type=Assignment) 423 self._errors = csb.core.CollectionContainer() 424 425 resi = [factory.residue(native) for native in residues] 426 self._residues = csb.core.ReadOnlyCollectionContainer(items=resi, 427 type=TargetResidue, start_index=1) 428 429 if segments is not None: 430 segments = dict([(s.start, s) for s in segments]) 431 self._segments = csb.core.ReadOnlyDictionaryContainer(items=segments)
432 433 @staticmethod
434 - def from_sequence(id, sequence):
435 """ 436 Factory, which builds L{Target} objects from a bare sequence. 437 438 @param sequence: target's sequence 439 @type sequence: L{csb.bio.sequence.AbstractSequence}, str or iterable 440 441 @rtype: L{Target} 442 """ 443 444 if isinstance(sequence, csb.bio.sequence.Sequence): 445 sequence = sequence.sequence 446 447 residues = [] 448 449 for rn, aa in enumerate(sequence, start=1): 450 residue = csb.bio.structure.ProteinResidue(rank=rn, type=aa) 451 residues.append(residue) 452 453 return Target(id, len(residues), residues)
454 455 @staticmethod
456 - def from_profile(hmm):
457 """ 458 Factory, which builds L{Target} objects from an HMM profile. 459 460 @param hmm: target's HMM 461 @type hmm: L{csb.bio.hmm.ProfileHMM} 462 463 @rtype: L{Target} 464 """ 465 466 residues = [ r.clone() for r in hmm.residues ] 467 return Target(hmm.id, hmm.layers.length, residues)
468 469 @staticmethod
470 - def deserialize(pickle):
 471  
 472         with open(pickle, 'rb') as stream: 
 473             return csb.io.Pickle.load(stream)
474 475 @property
476 - def _children(self):
477 return self._residues
478 479 @property
480 - def errors(self):
481 return self._errors
482 483 @property
484 - def id(self):
485 return self._id
486 487 @property
488 - def accession(self):
489 return self._accession
490 491 @property
492 - def chain_id(self):
493 return self._chain_id
494 495 @property
496 - def max_overlap(self):
497 return self._overlap
498 499 @property
500 - def length(self):
501 return self._length
502 503 @property
504 - def sequence(self):
505 return ''.join(str(r.native.type) for r in self)
506 507 @property
508 - def matches(self):
509 return self._assignments
510 511 @property
512 - def residues(self):
513 return self._residues
514 515 @property
516 - def segments(self):
517 return self._segments
518
519 - def assign(self, fragment):
520 """ 521 Add a new fragment match. 522 @param fragment: fragment to assign 523 @type fragment: L{Assignment} 524 """ 525 526 if not 1 <= fragment.qstart <= fragment.qend <= len(self._residues): 527 raise ValueError("Fragment out of range") 528 529 self._assignments._append_item(fragment) 530 531 for rank in range(fragment.qstart, fragment.qend + 1): 532 ai = ResidueAssignmentInfo(fragment, rank) 533 self._residues[rank].assign(ai) 534 535 if fragment.segment is not None: 536 try: 537 self._segments[fragment.segment].assign(fragment) 538 except KeyError: 539 raise ValueError("Undefined segment starting at {0}".format(fragment.segment))
540
541 - def assignall(self, fragments):
542 """ 543 Assign a bunch of fragments at once. 544 @type fragments: iterable of L{Assignment}s 545 """ 546 for frag in fragments: 547 self.assign(frag)
548
549 - def filter(self, threshold=1.5, extend=False):
550 """ 551 Filter the current fragment map using a L{FragmentCluster}. 552 553 @param threshold: cluster RMSD threshold (see L{FragmentCluster}) 554 @type threshold: float 555 @param extend: pick extended alternatives where possible (default=False) 556 @type extend: bool 557 558 @return: a new target, containing only cluster centroids/reps 559 @rtype: L{Target} 560 """ 561 562 target = self.clone() 563 564 for residue in self.residues: 565 rep = residue.filter(threshold=threshold, extend=extend) 566 567 if rep is not None: 568 target.assign(rep.centroid) 569 570 return target
571
572 - def clone(self):
573 """ 574 @return: a deep copy of the target 575 @rtype: L{Target} 576 """ 577 578 segments = [self.segments[start] for start in self.segments] 579 segments = [TargetSegment(s.start, s.end, s.count) for s in segments] 580 581 target = self._factory.target(self.id, self.length, [r.native for r in self.residues], 582 overlap=self._overlap, segments=segments) 583 584 return target
585
586 -class ChemShiftTarget(Target):
587
588 - def __init__(self, id, length, residues, overlap=None):
592
593 - def assign(self, fragment):
594 595 if not 1 <= fragment.qstart <= fragment.qend <= len(self._residues): 596 raise ValueError("Fragment out of range") 597 598 self._assignments._append_item(fragment) 599 600 rank = fragment.qstart 601 ai = ResidueAssignmentInfo(fragment, rank) 602 self._residues[rank].assign(ai)
603
604 - def clone(self):
605 return self._factory.target(self.id, self.length, [r.native for r in self.residues], 606 overlap=self._overlap)
607
608 -class TargetResidue(object):
609 """ 610 Wrapper around L{Target}'s native residues. Decorates them with additional, 611 fragment-related methods. 612 613 @type native_residue: L{csb.bio.structure.ProteinResidue} 614 """ 615
616 - def __init__(self, native_residue):
617 618 self._type = native_residue.type 619 self._native = native_residue.clone() 620 self._assignments = csb.core.ReadOnlyCollectionContainer(type=ResidueAssignmentInfo)
621 622 @property
623 - def type(self):
624 return self._type
625 626 @property
627 - def native(self):
628 return self._native
629 630 @property
631 - def assignments(self):
632 return self._assignments
633
634 - def assign(self, assignment_info):
635 self._assignments._append_item(assignment_info)
636
637 - def verybest(self):
638 """ 639 @return: the fragment with the lowest RMSD at this position in the L{Target} 640 @rtype: L{Assignment} 641 """ 642 643 best = None 644 645 for ai in self.assignments: 646 a = ai.fragment 647 if a.length < FragmentCluster.MIN_LENGTH: 648 continue 649 if best is None or a.rmsd < best.rmsd: 650 best = a 651 elif a.rmsd == best.rmsd and a.length > best.length: 652 best = a 653 654 return best
655
656 - def filter(self, method=Metrics.RMSD, threshold=1.5, extend=False):
657 """ 658 Filter all fragments, covering this position in the L{Target} using a 659 L{FragmentCluster}. 660 661 @param method: one of the L{Metrics} members (default=L{Metrics.RMSD}) 662 @type method: str 663 @param threshold: cluster RMSD threshold (see L{FragmentCluster}) 664 @type threshold: float 665 @param extend: pick extended alternative where possible (default=False) 666 @type extend: bool 667 668 @return: cluster's representative (if converged) or None 669 @rtype: L{ClusterRep} or None 670 """ 671 672 try: 673 nodes = [] 674 for ai in self.assignments: 675 node = ClusterNode.create(ai.fragment, method, extend) 676 nodes.append(node) 677 678 cluster = FragmentCluster(nodes, threshold=threshold) 679 680 center = cluster.shrink(minitems=0) 681 if center.has_alternative: 682 center.exchange() 683 684 return center 685 686 except (ClusterExhaustedError, ClusterDivergingError): 687 return None
688
689 - def longest(self):
690 """ 691 @return: the longest fragment, covering the current position 692 @rtype: L{Assignment} 693 """ 694 best = None 695 696 for q in self.assignments: 697 if best is None or (q.fragment.length > best.length): 698 best = q.fragment 699 700 return best
701
702 - def precision(self, threshold=1.5):
703 """ 704 @return: the residue-wise precision of the fragment library at the 705 current position (percentage). 706 707 @param threshold: true-positive RMSD cutoff (default=1.5) 708 @type threshold: float 709 @rtype: float 710 """ 711 712 if self.assignments.length < 1: 713 return None 714 else: 715 positive = [a for a in self.assignments if a.fragment.rmsd <= threshold] 716 pos = len(positive) * 100.0 / self.assignments.length 717 718 return pos
719
720 -class ChemShiftTargetResidue(TargetResidue):
721
722 - def verybest(self):
723 724 best = None 725 726 for ai in self.assignments: 727 a = ai.fragment 728 729 if a.score < ChemShiftAssignment.BIT_SCORE_THRESHOLD * a.window: 730 continue 731 732 if best is None or a.score > best.score: 733 best = a 734 elif a.score == best.score and a.length > best.length: 735 best = a 736 737 return best
738
739 -class TargetSegment(object):
740
741 - def __init__(self, start, end, count):
742 743 self._start = start 744 self._end = end 745 self._count = count 746 747 self._assignments = csb.core.ReadOnlyCollectionContainer(type=Assignment)
748 749 @property
750 - def count(self):
751 return self._count
752 753 @property
754 - def start(self):
755 return self._start
756 757 @property
758 - def end(self):
759 return self._end
760 761 @property
762 - def length(self):
763 return (self._end - self._start + 1)
764 765 @property
766 - def assignments(self):
767 return self._assignments
768
769 - def assign(self, fragment):
770 if fragment.segment != self.start: 771 raise ValueError('Segment origin mismatch: {0} vs {1}'.format(fragment.segment, self.start)) 772 else: 773 self._assignments._append_item(fragment)
774
775 - def verybest(self):
776 777 best = None 778 779 for a in self.assignments: 780 if a.length < FragmentCluster.MIN_LENGTH: 781 continue 782 if best is None or a.rmsd < best.rmsd: 783 best = a 784 elif a.rmsd == best.rmsd and a.length > best.length: 785 best = a 786 787 return best
788
789 - def best(self, method=Metrics.RMSD):
790 791 try: 792 cluster = FragmentCluster(self.assignments, threshold=1.5, 793 connectedness=0.5, method=method) 794 centroid = cluster.shrink(minitems=1) 795 return centroid 796 797 except ClusterExhaustedError: 798 return None 799 finally: 800 del cluster
801
802 - def longest(self):
803 804 best = None 805 806 for q in self.assignments: 807 if best is None or (q.length > best.length): 808 best = q 809 810 return best
811
812 - def pairwise_rmsd(self, min_overlap=5):
813 814 rmsds = [] 815 816 for q in self.assignments: 817 for s in self.assignments: 818 if q is not s: 819 r = q.rmsd_to(s, min_overlap) 820 if r is not None: 821 rmsds.append(r) 822 else: 823 assert q.rmsd_to(s, 1) < 0.01 824 825 return rmsds
826
827 - def pairwise_mda(self, min_overlap=5):
828 829 mdas = [] 830 831 for q in self.assignments: 832 for s in self.assignments: 833 if q is not s: 834 m = q.mda_to(s, min_overlap) 835 if m is not None: 836 mdas.append(m) 837 return mdas
838
839 - def pairwise_sa_rmsd(self, profiles='.', min_overlap=5):
840 841 from csb.bio.hmm import RELATIVE_SA 842 from csb.bio.io.hhpred import ScoreUnits, HHProfileParser 843 844 def convert_sa(sa): 845 return numpy.array([ RELATIVE_SA[i] for i in sa ])
846 847 sources = {} 848 scores = [] 849 850 for q in self.assignments: 851 for s in self.assignments: 852 853 if s.source_id not in sources: 854 hmm = HHProfileParser(os.path.join(profiles, s.source_id + '.hhm')).parse() 855 sources[s.source_id] = hmm.dssp_solvent 856 857 if q is not s: 858 859 common = q.overlap(s) 860 if len(common) >= min_overlap: 861 862 qsa = q.solvent_at(sources[q.source_id], min(common), max(common)) 863 ssa = s.solvent_at(sources[s.source_id], min(common), max(common)) 864 865 if '-' in qsa + ssa: 866 continue 867 868 qsa = convert_sa(qsa) 869 ssa = convert_sa(ssa) 870 assert len(qsa) == len(ssa) 871 sa_rmsd = numpy.sqrt(numpy.sum((qsa - ssa) ** 2) / float(len(qsa))) 872 873 scores.append(sa_rmsd) 874 return scores
875
876 - def pairwise_scores(self, profiles='.', min_overlap=5):
877 878 from csb.bio.hmm import BACKGROUND 879 back = numpy.sqrt(numpy.array(BACKGROUND)) 880 881 sources = {} 882 scores = [] 883 884 for q in self.assignments: 885 for s in self.assignments: 886 887 if s.source_id not in sources: 888 # hmm = HHProfileParser(os.path.join(hmm_path, s.source_id + '.hhm')).parse(ScoreUnits.Probability) 889 sources[s.source_id] = csb.io.Pickle.load(open(os.path.join(profiles, s.source_id + '.pkl'), 'rb')) 890 891 if q is not s: 892 893 common = q.overlap(s) 894 if len(common) >= min_overlap: 895 896 qprof = q.profile_at(sources[q.source_id], min(common), max(common)) 897 sprof = s.profile_at(sources[s.source_id], min(common), max(common)) 898 899 #score = qhmm.emission_similarity(shmm) 900 assert len(qprof) == len(sprof) 901 dots = [ numpy.dot(qprof[i] / back, sprof[i] / back) for i in range(len(qprof)) ] 902 score = numpy.log(numpy.prod(dots)) 903 if score is not None: 904 scores.append(score) 905 return scores
906
907 - def _entropy(self, data, binsize):
 908  
 909         binsize = float(binsize) 
 910         bins = numpy.ceil(numpy.array(data) / binsize) 
 911  
 912         hist = dict.fromkeys(bins, 0) 
 913         for bin in bins: 
 914             hist[bin] += (1.0 / len(bins)) 
 915  
 916         freq = numpy.array(list(hist.values())) 
 917         return - numpy.sum(freq * numpy.log(freq))
918
919 - def rmsd_entropy(self, binsize=0.1):
920 921 rmsds = self.pairwise_rmsd() 922 return self._entropy(rmsds, binsize)
923
924 - def score_entropy(self, profiles='.', binsize=1):
925 926 scores = self.pairwise_scores(profiles) 927 return self._entropy(scores, binsize)
928
929 - def rmsd_consistency(self, threshold=1.5):
930 931 rmsds = self.pairwise_rmsd() 932 933 if len(rmsds) < 1: 934 return None 935 936 return sum([1 for i in rmsds if i <= threshold]) / float(len(rmsds))
937
938 - def sa_rmsd_consistency(self, threshold=0.4, profiles='.'):
939 940 sa_rmsds = self.pairwise_sa_rmsd(profiles=profiles) 941 942 if len(sa_rmsds) < 1: 943 return None 944 945 return sum([1 for i in sa_rmsds if i <= threshold]) / float(len(sa_rmsds))
946
947 - def true_positives(self, threshold=1.5):
948 949 if self.assignments.length < 1: 950 return None 951 952 return sum([1 for i in self.assignments if i.rmsd <= threshold]) / float(self.assignments.length)
953
954 - def confidence(self):
955 956 cons = self.rmsd_consistency() 957 958 if cons is None: 959 return 0 960 else: 961 return numpy.log10(self.count) * cons
962
963 -class ResidueAssignmentInfo(object):
964
965 - def __init__(self, assignment, rank):
 966  
 967         if not assignment.qstart <= rank <= assignment.qend: 
 968             raise ValueError('Rank {0} is not matched by this assignment'.format(rank)) 
 969  
 970         self._assignment = assignment 
 971         self._rank = rank 
 972         self._relrank = rank - assignment.qstart
973 974 @property
975 - def c_alpha(self):
976 return self._assignment.backbone[self._relrank]
977 978 @property
979 - def fragment(self):
980 return self._assignment
981
982 -class Assignment(FragmentMatch):
983 """ 984 Represents a match between a fragment and its target. 985 986 @param source: source structure (must have torsion angles precomputed) 987 @type source: L{csb.bio.structure.Chain} 988 @param start: start position in C{source} (rank) 989 @type start: int 990 @param end: end position in C{source} (rank) 991 @type end: int 992 @param id: fragment ID 993 @type id: str 994 @param qstart: start position in target (rank) 995 @type qstart: int 996 @param qend: end position in target (rank) 997 @type qend: int 998 @param probability: probability of assignment 999 @type probability: float 1000 @param rmsd: RMSD of the fragment, compared to target's native structure 1001 @type rmsd: float 1002 """ 1003
1004 - def __init__(self, source, start, end, qstart, qend, id=None, probability=None, rmsd=None, 1005 tm_score=None, score=None, neff=None, segment=None, internal_id=None):
1006 1007 assert source.has_torsion 1008 sub = source.subregion(start, end, clone=True) 1009 try: 1010 calpha = [r.atoms['CA'].vector.copy() for r in sub.residues] 1011 except csb.core.ItemNotFoundError: 1012 raise csb.bio.structure.Broken3DStructureError() 1013 torsion = [r.torsion.copy() for r in sub.residues] 1014 1015 self._calpha = csb.core.ReadOnlyCollectionContainer(items=calpha, type=numpy.ndarray) 1016 self._torsion = torsion 1017 self._sequence = sub.sequence 1018 1019 self._source_id = source.accession[:4] + source.id 1020 self._start = start 1021 self._end = end 1022 1023 self._score = score 1024 self._neff = neff 1025 self._ss = None 1026 1027 self._segment_start = segment 1028 self.internal_id = internal_id 1029 1030 if id is None: 1031 id = "{0}:{1}-{2}".format(self.source_id, self.start, self.end) 1032 1033 super(Assignment, self).__init__(id, qstart, qend, probability, rmsd, tm_score, None) 1034 1035 self._ss = SecondaryStructure('-' * self.length)
1036 1037 @staticmethod
1038 - def from_fragment(fragment, provider):
1039 """ 1040 Create a new L{Assignment} given a source rosetta fragment. 1041 1042 @param fragment: rosetta fragment 1043 @type fragment: L{RosettaFragment} 1044 @param provider: PDB database provider 1045 @type provider: L{StructureProvider} 1046 1047 @rtype: L{Assignment} 1048 """ 1049 try: 1050 structure = provider.get(fragment.accession) 1051 except KeyError: 1052 structure = provider.get(fragment.source_id) 1053 source = structure.chains[fragment.chain] 1054 source.compute_torsion() 1055 1056 id = "{0}:{1}-{2}".format(fragment.source_id, fragment.start, fragment.end) 1057 1058 return Assignment(source, fragment.start, fragment.end, 1059 fragment.qstart, fragment.qend, id, 0, 0)
1060 1061 @property
1062 - def backbone(self):
1063 return self._calpha
1064 1065 @property
1066 - def sequence(self):
1067 return self._sequence
1068 1069 @property
1070 - def torsion(self):
1071 return self._torsion
1072 1073 @property
1074 - def source_id(self):
1075 return self._source_id
1076 1077 @property
1078 - def start(self):
1079 return self._start
1080 1081 @property
1082 - def end(self):
1083 return self._end
1084 1085 @property
1086 - def score(self):
1087 return self._score
1088 1089 @property
1090 - def neff(self):
1091 return self._neff
1092 1093 @property
1094 - def segment(self):
1095 return self._segment_start
1096 1097 @property
1098 - def secondary_structure(self):
1099 return self._ss
1100 @secondary_structure.setter
1101 - def secondary_structure(self, value):
1102 1103 if isinstance(value, csb.core.string): 1104 value = csb.bio.structure.SecondaryStructure(value) 1105 if len(str(value)) != self.length:#(value.end - value.start + 1) != self.length: 1106 raise ValueError("Invalid secondary structure length", len(str(value)), self.length ) 1107 1108 self._ss = value
1109
1110 - def transform(self, rotation, translation):
1111 """ 1112 Apply rotation/translation to fragment's coordinates in place. 1113 """ 1114 1115 for ca in self.backbone: 1116 newca = numpy.dot(ca, numpy.transpose(rotation)) + translation 1117 for i in range(3): 1118 ca[i] = newca[i]
1119
1120 - def _check_range(self, qstart, qend):
1121 1122 if not (self.qstart <= qstart <= qend <= self.qend): 1123 raise ValueError('Region {0}..{1} is out of range {2.qstart}..{2.qend}'.format(qstart, qend, self))
1124
1125 - def anchored_around(self, rank):
1126 """ 1127 @return: True if the fragment is centered around position=C{rank}. 1128 @rtype: bool 1129 """ 1130 1131 if self.qstart < rank < self.qend: 1132 if (rank - self.qstart + 1) > 0.4 * (self.qend - self.qstart + 1): 1133 return True 1134 1135 return False
1136
1137 - def backbone_at(self, qstart, qend):
1138 """ 1139 @return: the CA coordinates of the fragment at the specified subregion. 1140 @rtype: list 1141 """ 1142 1143 self._check_range(qstart, qend) 1144 1145 relstart = qstart - self.qstart 1146 relend = qend - self.qstart + 1 1147 1148 return self.backbone[relstart : relend]
1149
1150 - def torsion_at(self, qstart, qend):
1151 """ 1152 @return: the torsion angles of the fragment at the specified subregion. 1153 @rtype: list 1154 """ 1155 1156 self._check_range(qstart, qend) 1157 1158 relstart = qstart - self.qstart 1159 relend = qend - self.qstart + 1 1160 1161 return self.torsion[relstart : relend]
1162
1163 - def solvent_at(self, sa_string, qstart, qend):
1164 1165 self._check_range(qstart, qend) 1166 1167 relstart = qstart - self.qstart 1168 relend = qend - self.qstart + 1 1169 1170 return sa_string[relstart : relend]
1171
1172 - def sec_structure_at(self, qstart, qend):
1173 1174 self._check_range(qstart, qend) 1175 start = qstart - self.qstart + 1 1176 end = qend - self.qstart + 1 1177 1178 return self.secondary_structure.scan(start, end, loose=True, cut=True)
1179
1180 - def profile_at(self, source, qstart, qend):
1181 1182 self._check_range(qstart, qend) 1183 1184 start = qstart - self.qstart + self.start 1185 end = qend - self.qstart + self.start 1186 1187 if hasattr(source, 'subregion'): 1188 return source.subregion(start, end) 1189 else: 1190 return source[start - 1 : end]
1191
1192 - def chain_at(self, source, qstart, qend):
1193 1194 self._check_range(qstart, qend) 1195 1196 start = qstart - self.qstart + self.start 1197 end = qend - self.qstart + self.start 1198 1199 return source.subregion(start, end)
1200
1201 - def overlap(self, other):
1202 """ 1203 @type other: L{Assignment} 1204 @return: target positions, covered by both C{self} and C{other} 1205 @rtype: set of int 1206 """ 1207 1208 qranks = set(range(self.qstart, self.qend + 1)) 1209 sranks = set(range(other.qstart, other.qend + 1)) 1210 1211 return qranks.intersection(sranks)
1212
1213 - def rmsd_to(self, other, min_overlap=5):
1214 """ 1215 @return: the CA RMSD between C{self} and C{other}. 1216 1217 @param other: another fragment 1218 @type other: L{Assignment} 1219 @param min_overlap: require at least that number of overlapping residues 1220 (return None if not satisfied) 1221 @type min_overlap: int 1222 1223 @rtype: float 1224 """ 1225 if self is other: 1226 return 0 1227 1228 common = self.overlap(other) 1229 1230 if len(common) >= min_overlap: 1231 1232 qstart, qend = min(common), max(common) 1233 1234 q = self.backbone_at(qstart, qend) 1235 s = other.backbone_at(qstart, qend) 1236 1237 if len(q) > 0 and len(s) > 0: 1238 return csb.bio.utils.rmsd(numpy.array(q), numpy.array(s)) 1239 1240 return None
1241
1242 - def nrmsd_to(self, other, min_overlap=5):
1243 1244 common = self.overlap(other) 1245 1246 if len(common) >= min_overlap: 1247 1248 qstart, qend = min(common), max(common) 1249 1250 q = self.backbone_at(qstart, qend) 1251 s = other.backbone_at(qstart, qend) 1252 1253 if len(q) > 0 and len(s) > 0: 1254 return csb.bio.utils.rmsd(q, s) / RANDOM_RMSD[ len(common) ] 1255 1256 return None
1257
1258 - def mda_to(self, other, min_overlap=5):
1259  
1260         common = self.overlap(other) 
1261  
1262         if len(common) >= min_overlap: 
1263  
1264             qstart, qend = min(common), max(common) 
1265  
1266             q = self.torsion_at(qstart, qend) 
1267             s = other.torsion_at(qstart, qend) 
1268  
1269             if len(q) > 0 and len(s) > 0: 
1270  
1271                 maxphi = max(numpy.abs(i.phi - j.phi) for i, j in list(zip(q, s))[1:])   # phi: 2 .. L 
1272                 maxpsi = max(numpy.abs(i.psi - j.psi) for i, j in list(zip(q, s))[:-1])  # psi: 1 .. L-1 
1273  
1274                 return max(maxphi, maxpsi) 
1275  
1276         return None
1277
1278 - def to_rosetta(self, source, qstart=None, qend=None, weight=None):
1279 """ 1280 @deprecated: this method will be deleted soon. Use 1281 L{csb.bio.fragments.rosetta.OutputBuilder} instead. 1282 """ 1283 stream = csb.io.MemoryStream() 1284 1285 if weight is None: 1286 weight = self.probability 1287 if not qstart: 1288 qstart = self.qstart 1289 if not qend: 1290 qend = self.qend 1291 1292 source.compute_torsion() 1293 chain = self.chain_at(source, qstart, qend) 1294 1295 for i, r in enumerate(chain.residues): 1296 1297 acc = self.source_id[:4] 1298 ch = self.source_id[4].upper() 1299 1300 start = qstart - self.qstart + self.start + i 1301 aa = r.type 1302 ss = 'L' 1303 phi, psi, omega = 0, 0, 0 1304 if r.torsion.phi: 1305 phi = r.torsion.phi 1306 if r.torsion.psi: 1307 psi = r.torsion.psi 1308 if r.torsion.omega: 1309 omega = r.torsion.omega 1310 1311 stream.write(' {0:4} {1:1} {2:>5} {3!s:1} {4!s:1} {5:>8.3f} {6:>8.3f} {7:>8.3f} {8:>8.3f}\n'.format(acc, ch, start, aa, ss, phi, psi, omega, weight)) 1312 1313 return stream.getvalue()
1314
1315 -class ChemShiftAssignment(Assignment):
1316 1317 BIT_SCORE_THRESHOLD = 1.1 1318
1319 - def __init__(self, source, start, end, qstart, qend, window, score, rmsd):
1320 1321 self._window = window 1322 1323 super(ChemShiftAssignment, self).__init__( 1324 source, start, end, qstart, qend, id=None, probability=1.0, 1325 rmsd=rmsd, tm_score=None, score=score, neff=None, segment=None, internal_id=None)
1326 1327 @property
1328 - def window(self):
1329 return self._window
1330
1331 -class ClusterExhaustedError(ValueError):
1332 pass
1333
1334 -class ClusterEmptyError(ClusterExhaustedError):
1335 pass
1336
1337 -class ClusterDivergingError(RuntimeError):
1338 pass
1339
1340 -class FragmentCluster(object):
1341 """ 1342 Provides clustering/filtering of the fragments, covering a common residue 1343 in the target. Clustering is done via iterative shrinking of the cluster. 1344 At each iteration, node rejection (deletion) is attempted for each node. The 1345 node rejection, causing the most significant drop in the average pairwise 1346 distance (RMSD) in the cluster, is retained. This procedure is repeated 1347 until: 1) the average pairwise RMSD drops below the C{threshold} (converged), 1348 2) the cluster gets exhausted or 3) node rejection no longer 1349 causes a drop in the average distance (not converging). 1350 1351 @param items: cluster members 1352 @type items: iterable of L{ClusterNode}s 1353 @param threshold: RMSD threshold; continue shrinking until the mean distance 1354 drops below this value (default=1.5) 1355 @type threshold: float 1356 @param connectedness: when calculating centroids, consider only nodes 1357 connected to at least c% of all surviving vertices 1358 (default=0.5) 1359 @type connectedness: float 1360 """ 1361 1362 MIN_LENGTH = 6 1363
1364 - def __init__(self, items, threshold=1.5, connectedness=0.5):
1365 1366 items = set(i for i in items if i.fragment.length >= FragmentCluster.MIN_LENGTH) 1367 1368 self._matrix = {} 1369 self._threshold = float(threshold) 1370 self._connectedness = float(connectedness) 1371 self._weight = 0 1372 self._edges = 0 1373 1374 visited = set() 1375 1376 for i in items: 1377 self._matrix.setdefault(i, {}) 1378 1379 for j in items: 1380 self._matrix.setdefault(j, {}) 1381 1382 if (j, i) not in visited: 1383 visited.add((i, j)) 1384 distance = i.distance(j) 1385 1386 if distance is not None: 1387 self._matrix[i][j] = distance 1388 self._matrix[j][i] = distance 1389 i.weight += distance 1390 j.weight += distance 1391 1392 self._weight += distance 1393 self._edges += 1 1394 1395 self._items = set(self._matrix.keys()) 1396 1397 if len(self._items) < 1: 1398 raise ClusterEmptyError() 1399 1400 self._initcount = self.count
1401 1402 @property
1403 - def count(self):
1404 return len(self._items)
1405 1406 @property
1407 - def items(self):
1408 return tuple(self._items)
1409 1410 @property
1411 - def fragments(self):
1412 return tuple(i.fragment for i in self._items)
1413 1414 @property
1415 - def threshold(self):
1416 return self._threshold
1417 @threshold.setter
1418 - def threshold(self, value):
1419 self._threshold = float(value)
1420 1421 @property
1422 - def connectedness(self):
1423 return self._connectedness
1424
1425 - def _distances(self, skip=None):
1426 1427 d = [] 1428 1429 for i in self._matrix: 1430 if skip is i: 1431 continue 1432 1433 for j in self._matrix[i]: 1434 if skip is not j: 1435 d.append(self._matrix[i][j]) 1436 1437 return d
1438
1439 - def _distance(self, i, j):
1440 1441 if j in self._matrix[i]: 1442 return self._matrix[i][j] 1443 else: 1444 return None
1445
1446 - def mean(self, skip=None):
1447 """ 1448 @return: the current mean distance in the cluster 1449 @rtype: float 1450 """ 1451 if self._edges == 0: 1452 raise ClusterExhaustedError() 1453 1454 if not skip: 1455 return float(self._weight) / self._edges 1456 1457 else: 1458 weight = self._weight - skip.weight 1459 edges = self._edges - len(self._matrix[skip]) 1460 1461 if edges < 1: 1462 return 0 1463 else: 1464 return float(weight) / edges
1465
1466 - def centroid(self):
1467 """ 1468 @return: the current representative fragment 1469 @rtype: L{ClusterRep} 1470 1471 @note: the cluster rep is the node with the lowest average distance 1472 to all other nodes. If a fixed fragment exists, structurally similar 1473 to the rep, but longer, this fragment may be suggested as an alternative 1474 (see also L{ClusterRep}). 1475 """ 1476 1477 alt = None 1478 cen = None 1479 avg = None 1480 1481 for i in self._matrix: 1482 edges = len(self._matrix[i]) or (1.0 / self.count) 1483 curravg = float(i.weight) / edges 1484 conn = len(self._matrix[i]) / float(self.count) 1485 1486 if avg is None or (curravg < avg and conn >= self.connectedness): 1487 avg = curravg 1488 cen = i 1489 elif curravg == avg: 1490 if i.fragment.length > cen.fragment.length: 1491 cen = i 1492 1493 d = self._distances() 1494 mean = numpy.mean(d) 1495 cons = sum(1.0 for i in d if i <= self.threshold) / len(d) 1496 1497 for i in self._matrix: 1498 if i is not cen and i.fixed and i.fragment.length > cen.fragment.length: 1499 distance = self._distance(i, cen) 1500 if distance is not None and distance < 0.5 * self.threshold: 1501 if alt is None or alt.fragment.length < i.fragment.length: 1502 alt = i 1503 1504 return ClusterRep(cen, mean, cons, len(self._matrix[cen]), alternative=alt, 1505 rejections=(self._initcount - self.count))
1506
1507 - def reject(self, item):
1508 """ 1509 Remove C{item} from the cluster. 1510 1511 @type item: L{ClusterNode} 1512 @raise ClusterExhaustedError: if this is the last remaining item 1513 """ 1514 if self.count == 1: 1515 raise ClusterExhaustedError() 1516 1517 assert not item.fixed 1518 1519 for i in self._matrix: 1520 if item in self._matrix[i]: 1521 distance = self._matrix[i][item] 1522 self._weight -= distance 1523 i.weight -= distance 1524 1525 del self._matrix[i][item] 1526 self._edges -= 1 1527 1528 del self._matrix[item] 1529 self._items.remove(item)
1530
1531 - def shrinkone(self):
1532 """ 1533 Shrink the cluster by a single node. 1534 1535 @return: True on successful shrink, False otherwise (e.g. if 1536 already converged) 1537 @rtype: bool 1538 @raise ClusterExhaustedError: if exhausted 1539 @raise ClusterDivergingError: if not converging 1540 """ 1541 1542 mean = self.mean() 1543 if mean <= self.threshold or self.count == 1: 1544 return False # already shrunk enough 1545 1546 m = {} 1547 1548 for i in self._matrix: 1549 if not i.fixed: 1550 newmean = self.mean(skip=i) 1551 m[newmean] = i 1552 1553 if len(m) == 0: # only fixed items remaining 1554 raise ClusterExhaustedError() 1555 1556 newmean = min(m) 1557 1558 if newmean > mean: 1559 raise ClusterDivergingError() # can't converge, usually when fixed items are too far away from the average 1560 elif newmean < mean: 1561 junk = m[newmean] 1562 self.reject(junk) 1563 return True # successful shrink 1564 else: 1565 return False # converged
1566
1567 - def shrink(self, minitems=2):
1568 """ 1569 Start automatic shrinking. 1570 1571 @param minitems: absolute minimum of the number of nodes in the cluster 1572 @type minitems: int 1573 1574 @return: cluster's representative: the node with the lowest average 1575 distance to all other nodes in the cluster 1576 @rtype: L{ClusterRep} 1577 1578 @raise ClusterExhaustedError: if C{self.count} < C{minitems} and 1579 still not converged 1580 """ 1581 1582 if self.count > minitems: 1583 1584 while self.shrinkone(): 1585 if self.count <= minitems: 1586 raise ClusterExhaustedError() 1587 else: 1588 raise ClusterExhaustedError() 1589 1590 return self.centroid()
1591
1592 -class ClusterNode(object):
1593 """ 1594 Cluster node. 1595 1596 @param fragment: fragment 1597 @type fragment: L{Assignment} 1598 @param distance: distance metric (a L{Metrics} member, default is RMSD) 1599 @type distance: str 1600 @param fixed: mark this node as fixed (cannot be rejected) 1601 @type fixed: bool 1602 """ 1603 1604 FIXED = 0.7 1605 1606 @staticmethod
1607 - def create(fragment, method=Metrics.RMSD, extend=False):
1608 """ 1609 Create a new L{ClusterNode} given a specified C{Assignment}. If this 1610 assignment is a high probability match, define it as a fixed fragment. 1611 1612 @rtype: L{ClusterNode} 1613 """ 1614 if fragment.probability > ClusterNode.FIXED and fragment.length >= FragmentCluster.MIN_LENGTH: 1615 return ClusterNode(fragment, distance=method, fixed=extend) 1616 else: 1617 return ClusterNode(fragment, distance=method, fixed=False)
1618
1619 - def __init__(self, fragment, distance=Metrics.RMSD, fixed=False):
1620 1621 if fixed and fragment.length < FragmentCluster.MIN_LENGTH: 1622 raise ValueError("Can't fix a short fragment") 1623 1624 self.fragment = fragment 1625 self.fixed = bool(fixed) 1626 self.weight = 0 1627 1628 self._distance = getattr(self.fragment, distance)
1629
1630 - def distance(self, other):
1631 """ 1632 @return: the distance between self and another node 1633 @type other: L{ClusterNode} 1634 @rtype: float 1635 """ 1636 return self._distance(other.fragment)
1637
1638 -class ClusterRep(object):
1639 """ 1640 Cluster's representative (centroid) node. This object carries the 1641 result of shrinking itself. 1642 1643 @param centroid: rep node 1644 @type centroid: L{ClusterNode} 1645 @param mean: current mean distance in the cluster 1646 @type mean: float 1647 @param consistency: percentage of pairwise distances below the RMSD C{threshold} 1648 @type consistency: float 1649 @param count: current number of nodes in the cluster 1650 @type count: int 1651 @param rejections: total number of rejections 1652 @type rejections: int 1653 @param alternative: suggested cluster rep alternative (e.g. structurally 1654 similar to the centroid, but longer) 1655 @type param: 1656 """ 1657
1658 - def __init__(self, centroid, mean, consistency, count, rejections=0, alternative=None):
1659 1660 if isinstance(centroid, ClusterNode): 1661 centroid = centroid.fragment 1662 if isinstance(alternative, ClusterNode): 1663 alternative = alternative.fragment 1664 1665 self._centroid = centroid 1666 self._alternative = alternative 1667 self._mean = mean 1668 self._consistency = consistency 1669 self._count = count 1670 self._rejections = rejections
1671 1672 @property
1673 - def confidence(self):
1674 """ 1675 Confidence of assignment: log10(count) * consistency 1676 """ 1677 if self.count <= 0 or self.count is None or self.consistency is None: 1678 return 0 1679 else: 1680 return numpy.log10(self.count) * self.consistency
1681 1682 @property
1683 - def centroid(self):
1684 return self._centroid
1685 1686 @property
1687 - def alternative(self):
1688 return self._alternative
1689 1690 @property
1691 - def has_alternative(self):
1692 return self._alternative is not None
1693 1694 @property
1695 - def mean(self):
1696 return self._mean
1697 1698 @property
1699 - def consistency(self):
1700 return self._consistency
1701 1702 @property
1703 - def count(self):
1704 return self._count
1705 1706 @property
1707 - def rejections(self):
1708 return self._rejections
1709
1710 - def exchange(self):
1711 """ 1712 If an alternative is available, swap the centroid and the alternative. 1713 """ 1714 1715 if self._alternative is not None: 1716 1717 centroid = self._centroid 1718 self._centroid = self._alternative 1719 self._alternative = centroid
1720
1721 - def to_rosetta(self, source):
1722 """ 1723 @deprecated: this method is obsolete and will be deleted soon 1724 """ 1725 return self.centroid.to_rosetta(source, weight=self.confidence)
1726
1727 -class AdaptedAssignment(object):
1728 1729 @staticmethod
1730 - def with_overhangs(center, start, end, overhang=1):
1731  
1732         if center.centroid.qstart <= (start - overhang): 
1733             start -= overhang 
1734         elif center.centroid.qstart < start: 
1735             start = center.centroid.qstart 
1736  
1737         if center.centroid.qend >= (end + overhang): 
1738             end += overhang 
1739         elif center.centroid.qend > end: 
1740             end = center.centroid.qend 
1741  
1742         return AdaptedAssignment(center, start, end)
1743
1744 - def __init__(self, center, qstart, qend):
1745 1746 if qstart < center.centroid.qstart: 1747 raise ValueError(qstart) 1748 if qend > center.centroid.qend: 1749 raise ValueError(qend) 1750 1751 self._qstart = qstart 1752 self._qend = qend 1753 self._center = center
1754 1755 @property
1756 - def fragment(self):
1757 return self._center.centroid
1758 1759 @property
1760 - def center(self):
1761 return self._center
1762 1763 @property
1764 - def confidence(self):
1765 return self._center.confidence
1766 1767 @property
1768 - def qstart(self):
1769 return self._qstart
1770 1771 @property
1772 - def qend(self):
1773 return self._qend
1774 1775 @property
1776 - def backbone(self):
1777 return self.fragment.backbone_at(self.qstart, self.qend)
1778
1779 - def chain(self, source):
1780 return self.fragment.chain_at(source, self.qstart, self.qend)
1781
1782 - def to_rosetta(self, source):
1783 return self.fragment.to_rosetta(source, self.qstart, self.qend, self.confidence)
1784
1785 -class SmoothFragmentMap(csb.core.AbstractContainer):
1786
1787 - def __init__(self, length, centroids):
1788 1789 if not length > 0: 1790 raise ValueError(length) 1791 1792 self._length = int(length) 1793 self._slots = set(range(1, self._length + 1)) 1794 self._map = {} 1795 1796 centers = list(centroids) 1797 centers.sort(key=lambda i: i.confidence, reverse=True) 1798 1799 for c in centers: 1800 self.assign(c)
1801 1802 @property
1803 - def _children(self):
1804 return self._map
1805
1806 - def assign(self, center):
1807 1808 for r in range(center.centroid.qstart, center.centroid.qend + 1): 1809 if r in self._slots: 1810 self._map[r] = center 1811 self._slots.remove(r)
1812
1813 - def patches(self):
1814 1815 center = None 1816 start = None 1817 end = None 1818 1819 for r in range(1, self._length + 1): 1820 1821 if center is None: 1822 if r in self._map: 1823 center = self._map[r] 1824 start = end = r 1825 else: 1826 center = None 1827 start = end = None 1828 else: 1829 if r in self._map: 1830 if self._map[r] is center: 1831 end = r 1832 else: 1833 yield AdaptedAssignment(center, start, end) 1834 center = self._map[r] 1835 start = end = r 1836 else: 1837 yield AdaptedAssignment(center, start, end) 1838 center = None 1839 start = end = None
1840
1841 1842 -class ResidueEventInfo(object):
1843
1844 - def __init__(self, residue, confidence=0, count=0, confident=True, gap=False, rep=None):
1845 1846 self.residue = residue 1847 self.confidence = confidence 1848 self.confident = confident 1849 self.gap = gap 1850 self.count = count 1851 self.rep = rep
1852 1853 @property
1854 - def rank(self):
1855 return self.residue.rank
1856 1857 @property
1858 - def type(self):
1859 return self.residue.type
1860 1861 @property
1862 - def torsion(self):
1863 if self.rep: 1864 return self.rep.torsion_at(self.rank, self.rank)[0] 1865 else: 1866 return None
1867
1868 1869 -class RosettaFragsetFactory(object):
1870 """ 1871 Simplifies the construction of fragment libraries. 1872 """ 1873
1874 - def __init__(self):
1875 import csb.bio.fragments.rosetta as rosetta 1876 self.rosetta = rosetta
1877
1878 - def make_fragset(self, target):
1879 """ 1880 Build a fragment library given a L{Target} and its L{Assignment}s. 1881 1882 @param target: target protein 1883 @type target: L{Target} 1884 1885 @rtype: L{RosettaFragmentMap} 1886 """ 1887 1888 frag_factory = self.rosetta.RosettaFragment 1889 fragments = list(map(frag_factory.from_object, target.matches)) 1890 #fragments = [ frag_factory.from_object(f) for f in target.matches if f.length >= 6 ] 1891 fragments.sort() 1892 1893 return self.rosetta.RosettaFragmentMap(fragments, target.length)
1894
1895 - def make_chopped(self, fragments, window):
1896 """ 1897 Build a fixed-length fragment library from a list of 1898 variable-length L{Assignment}s. 1899 1900 @param fragments: source fragments 1901 @type fragments: iterable of L{RosettaFragment}s 1902 @param window: fixed-length fragment size (for classic Rosetta: choose 9) 1903 @type window: int 1904 1905 @return: fixed-length fragment library 1906 @rtype: L{RosettaFragmentMap} 1907 """ 1908 1909 frags = [] 1910 1911 for f in fragments: 1912 for qs in range(f.qstart, f.qend - window + 1): 1913 frags.append(f.subregion(qs, qs + window - 1)) 1914 1915 return self.rosetta.RosettaFragmentMap(frags)
1916
1917 - def make_combined(self, target, filling, threshold=0.5, callback=None):
1918 """ 1919 Complement C{target}'s assignments with C{filling} (e.g. rosetta fragments). 1920 The regions to be complemented are determined by calculating the confidence 1921 at each residue (by filtering). 1922 1923 1924 @param target: target protein 1925 @type target: L{Target} 1926 @param filling: additional fragments to place in the low-conf regions 1927 @type filling: L{RosettaFragmentMap} or iterable of L{RosettaFragment} 1928 @param threshold: confidence threshold 1929 @type threshold: float 1930 1931 @return: complemented fragment library 1932 @rtype: L{RosettaFragmentMap} 1933 """ 1934 1935 fragmap = self.make_fragset(target) 1936 covered = set() 1937 1938 for r in target.residues: 1939 1940 if r.assignments.length == 0: 1941 if callback: 1942 callback(ResidueEventInfo(r.native, gap=True)) 1943 continue 1944 1945 cluster = r.filter() 1946 if cluster is None: 1947 if callback: 1948 callback(ResidueEventInfo(r.native, 0, 0, confident=False)) 1949 continue 1950 1951 if cluster.confidence >= threshold: 1952 covered.add(r.native.rank) 1953 confident = True 1954 else: 1955 confident = False 1956 1957 if callback: 1958 callback(ResidueEventInfo(r.native, cluster.confidence, cluster.count, confident)) 1959 1960 for r in target.residues: 1961 if r.native.rank not in covered: # true for gaps and low-conf residues 1962 fragmap.mark_unconfident(r.native.rank) 1963 1964 for frag in filling: 1965 fragmap.complement(frag) 1966 1967 return fragmap
1968
1969 - def make_filtered(self, target, extend=False, callback=None):
1970 """ 1971 Builed a filtered fragment library (by clustering), containing only 1972 representative fragments (cluster centroids). 1973 1974 @param target: target protein 1975 @type target: L{Target} 1976 @param extend: if True, pick alternative reps if available 1977 @type extend: bool 1978 1979 @return: filtered fragment library 1980 @rtype: L{RosettaFragmentMap} 1981 """ 1982 1983 fragments = [] 1984 1985 for r in target.residues: 1986 if r.assignments.length == 0: 1987 if callback: 1988 callback(ResidueEventInfo(r.native, gap=True)) 1989 continue 1990 1991 cluster = r.filter(extend=extend) 1992 if cluster is None: 1993 if callback: 1994 callback(ResidueEventInfo(r.native, 0, 0, confident=False)) 1995 continue 1996 1997 if extend and cluster.has_alternative: 1998 best = cluster.alternative 1999 else: 2000 best = cluster.centroid 2001 2002 fragment = self.rosetta.RosettaFragment.from_object(best) 2003 fragments.append(fragment) 2004 if callback: 2005 callback(ResidueEventInfo(r.native, cluster.confidence, cluster.count, rep=cluster.centroid)) 2006 2007 fragments.sort() 2008 return self.rosetta.RosettaFragmentMap(fragments, target.length)
2009
2010 - def mix(self, *fragsets):
2011 """ 2012 Mix fragments from multiple libraries. 2013 2014 @type fragsets: L{RosettaFragmentMap} 2015 @return: mixed fragment library 2016 @rtype: L{RosettaFragmentMap} 2017 """ 2018 2019 fragments = [] 2020 length = 0 2021 2022 for fragset in fragsets: 2023 if fragset._length > length: 2024 length = fragset._length 2025 2026 for fragment in fragset: 2027 fragments.append(fragment) 2028 2029 return self.rosetta.RosettaFragmentMap(fragments, length)
2030
2031 2032 -class BenchmarkAdapter(object):
2033
2034 - class Connection(object):
2035 2036 FACTORY = None 2037 DSN = None 2038
2039 - def __init__(self, factory=None, dsn=None):
2040 2041 self.factory = factory or self.__class__.FACTORY 2042 self.cs = dsn or self.__class__.DSN 2043 self.connection = None 2044 self.cursor = None
2045
2046 - def __enter__(self):
2047 2048 self.connection = self.factory(self.cs) 2049 try: 2050 self.cursor = self.connection.cursor() 2051 except: 2052 self.connection.close() 2053 raise 2054 return self
2055
2056 - def __exit__(self, *args):
2057 try: 2058 if not self.cursor.closed: 2059 self.cursor.close() 2060 finally: 2061 if not self.connection.closed: 2062 self.connection.close()
2063
2064 - def __init__(self, pdb_paths, connection_string=None, factory=AssignmentFactory()):
2065 2066 self._pdb = pdb_paths 2067 self._connection = None 2068 2069 from csb.bio.io.wwpdb import find, StructureParser 2070 self._parser = StructureParser 2071 self._find = find 2072 self._factory = factory 2073 2074 try: 2075 import psycopg2.extras 2076 except ImportError: 2077 raise RuntimeError('Please install the psycopg2 module first') 2078 2079 if connection_string is None: 2080 connection_string = self.connection_string() 2081 2082 BenchmarkAdapter.Connection.FACTORY = psycopg2.extras.DictConnection 2083 BenchmarkAdapter.Connection.DSN = connection_string
2084 2085 @staticmethod
2086 - def connection_string(database='FragmentBenchmarks', host='', username='', password=''):
2087 2088 fields = ['dbname={0}'.format(database)] 2089 2090 if host: 2091 fields.append('host={0}'.format(host)) 2092 if username: 2093 fields.append('user={0}'.format(username)) 2094 fields.append('password={0}'.format(password)) 2095 2096 return ' '.join(fields)
2097
2098 - def targets(self, benchmark_id):
2099 2100 with BenchmarkAdapter.Connection() as db: 2101 2102 db.cursor.callproc('reporting."GetTargets"', (benchmark_id,)) 2103 return db.cursor.fetchall()
2104
2105 - def target_details(self, target_id):
2106 2107 with BenchmarkAdapter.Connection() as db: 2108 2109 db.cursor.callproc('reporting."GetTargetDetails"', (target_id,)) 2110 return db.cursor.fetchall()
2111
2112 - def assignments(self, target_id, type):
2113 2114 with BenchmarkAdapter.Connection() as db: 2115 2116 db.cursor.callproc('reporting."GetAssignments"', (target_id, type)) 2117 return db.cursor.fetchall()
2118
2119 - def assignments_sec_structure(self, target_id, type):
2120 2121 with BenchmarkAdapter.Connection() as db: 2122 2123 db.cursor.callproc('reporting."GetTargetSecStructureAssignments2"', (target_id, type)) 2124 return db.cursor.fetchall()
2125
2126 - def scores(self, benchmark_id, type):
2127 2128 with BenchmarkAdapter.Connection() as db: 2129 2130 db.cursor.callproc('reporting."GetScores"', (benchmark_id, type)) 2131 return db.cursor.fetchall()
2132
2133 - def centroids(self, benchmark_id):
2134 2135 with BenchmarkAdapter.Connection() as db: 2136 2137 db.cursor.callproc('reporting."GetCentroids"', (benchmark_id,)) 2138 return db.cursor.fetchall()
2139
2140 - def target_segments(self, target_id):
2141 2142 with BenchmarkAdapter.Connection() as db: 2143 2144 db.cursor.callproc('reporting."GetTargetSegments"', (target_id,)) 2145 data = db.cursor.fetchall() 2146 2147 return [ TargetSegment(row['Start'], row['End'], row['Count']) for row in data ]
2148
2149 - def structure(self, accession, chain=None):
2150 2151 pdbfile = self._find(accession, self._pdb) 2152 2153 if not pdbfile and chain: 2154 pdbfile = self._find(accession + chain, self._pdb) 2155 2156 if not pdbfile: 2157 raise IOError('{0} not found here: {1}'.format(accession, self._pdb)) 2158 2159 return self._parser(pdbfile).parse_structure()
2160
2161 - def prediction(self, target_id, type, ss=False):
2162 2163 info = self.target_details(target_id) 2164 if not info: 2165 raise ValueError('No such Target ID in the database: {0}'.format(target_id)) 2166 row = info[0] 2167 2168 id = row["Accession"] 2169 length = float(row["Length"]) 2170 overlap = float(row["MaxOverlap"]) / (length or 1.) 2171 2172 native = self.structure(id[:4], id[4]).chains[id[4]] 2173 segments = self.target_segments(target_id) 2174 target = self._factory.target(id, length, native.residues, overlap, segments) 2175 2176 source = None 2177 2178 for row in self.assignments(target_id, type): 2179 2180 src_accession = row['Source'][:4] 2181 src_chain = row['Source'][4] 2182 2183 if source is None or source.accession != src_accession: 2184 try: 2185 source = self.structure(src_accession, src_chain) 2186 except (IOError, ValueError) as ex: 2187 target.errors.append(ex) 2188 continue 2189 2190 if src_chain == '_': 2191 frag_chain = source.first_chain 2192 else: 2193 frag_chain = source.chains[src_chain] 2194 if not frag_chain.has_torsion: 2195 frag_chain.compute_torsion() 2196 2197 fragment = self._factory.assignment( 2198 source=frag_chain, 2199 start=row['SourceStart'], 2200 end=row['SourceEnd'], 2201 id=row['FragmentName'], 2202 qstart=row['Start'], 2203 qend=row['End'], 2204 probability=row['Probability'], 2205 score=row['Score'], 2206 neff=row['Neff'], 2207 rmsd=row['RMSD'], 2208 tm_score=row['TMScore'], 2209 segment=row['SegmentStart'], 2210 internal_id=row['InternalID']) 2211 2212 target.assign(fragment) 2213 2214 if ss: 2215 self._attach_sec_structure(target, target_id, type) 2216 2217 return target
2218
2219 - def _attach_sec_structure(self, target, target_id, type):
2220  
2221         ss = {} 
2222  
2223         for row in self.assignments_sec_structure(target_id, type): 
2224             frag_id, state = row["AssignmentID"], row["DSSP"] 
2225             if frag_id not in ss: 
2226                 ss[frag_id] = [] 
2227  
2228             ss[frag_id].append(state) 
2229  
2230         for a in target.matches: 
2231             if a.internal_id in ss: 
2232                 dssp = ''.join(ss[a.internal_id]) 
2233                 a.secondary_structure = dssp
2234