
Source Code for Package csb.bio.fragments

   1  """ 
   2  APIs for working with protein structure fragments and libraries. 
   3   
   4  This package contains the nuts and bolts of HHfrag. Everything here revolves 
   5  around the L{Target} class, which describes a protein structure prediction 
   6  target. One typically assigns fragments (L{Assignment}s) to the target and then 
   7  builds a fragment library with L{RosettaFragsetFactory}. 
   8   
   9  @note: Internal or legacy objects are intentionally left undocumented. 
  10         This typically indicates experimental code. 
  11  """ 
  12   
  13  import os 
  14  import numpy 
  15   
  16  import csb.io 
  17  import csb.core 
  18  import csb.bio.utils 
  19  import csb.bio.structure 
  20  import csb.bio.sequence 
  21  from csb.bio.structure import SecondaryStructure 
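def _example_workflow(chain):
    # Illustrative usage sketch of the flow described in the module docstring:
    # build a L{Target}, assign a fragment to it and compile a fragment library.
    # ``chain`` is assumed to be a csb.bio.structure.Chain with torsion angles
    # precomputed; the sequence, IDs and fragment boundaries below are
    # hypothetical placeholders, not values from the original package.
    target = Target.from_sequence('1xxxA', 'MKVLITGAGSGIG')
    fragment = Assignment(source=chain, start=1, end=9, id='1xxxA:1-9',
                          qstart=2, qend=10, probability=0.9, rmsd=0.7)
    target.assign(fragment)
    return RosettaFragsetFactory().make_fragset(target)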
class FragmentTypes(object):
    
    ISites = 'IS'
    HMMFragments = 'HH'
    HHThread = 'TH'
    HHfrag = HHThread
    Rosetta = 'NN'

class Metrics(object):
    
    RMSD = 'rmsd_to'
    NORMALIZED_RMSD = 'nrmsd_to'
    MDA = 'mda_to'

RANDOM_RMSD = {  5: 1.8749005857255376,  6: 2.4314283686276261,  7: 2.9021135267789608,  8: 3.2477716200172715,  9: 3.5469606556031708,
                10: 3.8295465524456329, 11: 4.1343107114131783, 12: 4.3761697929053014, 13: 4.6707299668248394, 14: 4.9379016881069733,
                15: 5.1809028645084911, 16: 5.4146957142595662, 17: 5.7135948448156988, 18: 5.9597935432566782, 19: 6.1337340535741962,
                20: 6.3962825155503271, 21: 6.6107937773415166, 22: 6.8099096274123401, 23: 7.0435583846849639, 24: 7.2160956482560970,
                25: 7.4547896324594962, 26: 7.6431870072434211, 27: 7.8727812194173836, 28: 8.0727393298443637, 29: 8.2551450998965326,
                30: 8.4413583511786587, 31: 8.5958719774122052, 32: 8.7730435506242408, 33: 8.9970648837941649, 34: 9.1566521405105163,
                35: 9.2828620878454728, 36: 9.4525824357923405, 37: 9.6322126445253300, 38: 9.7851684750961176, 39: 9.9891454649821476,
                40: 10.124373939352028, 41: 10.284348528344765, 42: 10.390457305096271, 43: 10.565792044674239, 44: 10.676532740033737,
                45: 10.789537132283652, 46: 11.004475543757550, 47: 11.064541647783571, 48: 11.231219875286985, 49: 11.319222637391441,
                50: 11.485478165340824, 51: 11.607522494435521, 52: 11.700268836069840, 53: 11.831245255954073, 54: 11.918975893263905 }
class FragmentMatch(object):
    """
    Base class, representing a match between a fragment and its target.
    """
    
    def __init__(self, id, qstart, qend, probability, rmsd, tm_score, qlength):
        
        self._id = id
        self._qstart = qstart
        self._qend = qend
        self._probability = probability
        self._rmsd = rmsd
        self._tm_score = tm_score
        self._qlength = qlength
62 63 @property
64 - def id(self):
65 return self._id
66 67 @property
68 - def qstart(self):
69 return self._qstart
70 71 @property
72 - def qend(self):
73 return self._qend
74 75 @property
76 - def qlength(self):
77 return self._qlength
78 79 @property
80 - def rmsd(self):
81 return self._rmsd
82 83 @property
84 - def tm_score(self):
85 return self._tm_score
86 87 @property
88 - def probability(self):
89 return self._probability
90 91 @property
92 - def length(self):
93 return self.qend - self.qstart + 1
94 95 @property
96 - def source_id(self):
97 raise NotImplementedError()
98 99 @property
100 - def start(self):
101 raise NotImplementedError()
102 103 @property
104 - def end(self):
105 raise NotImplementedError()
106
107 -class PredictionContainer(object):
108
109 - def __init__(self, target, isites_prediction, hmm_prediction, combined_prediction):
110 111 self.target = target 112 113 self.isites = isites_prediction 114 self.hmm = hmm_prediction 115 self.combined = combined_prediction
116
117 -class Prediction(object):
118
119 - def __init__(self, alignment, coordinates):
120 121 self.alignment = alignment 122 self.coordinates = coordinates
123
class TorsionAnglesPredictor(object):
    """
    Fragment-based phi/psi angles predictor.
    
    @param target: target protein, containing fragment assignments
    @type target: L{Target}
    @param threshold: RMSD distance threshold for L{FragmentCluster}-based filtering
    @type threshold: float
    @param extend: pick alternative, longer cluster reps, if possible
    @type extend: bool
    @param init: populate all L{FragmentCluster}s on instantiation. If False, this step
                 will be performed on demand (the first time C{predictor.compute()} is invoked)
    @type init: bool
    
    @note: if C{init} is False, the first call to C{predictor.compute()} might take a long
           time. Subsequent calls will be very fast.
    """
141 - def __init__(self, target, threshold=1.5, extend=False, init=False):
142 143 if not isinstance(target, Target): 144 raise TypeError(target) 145 if target.matches.length == 0: 146 raise ValueError('This target has no fragment assignments') 147 148 self._target = target 149 self._threshold = float(threshold) 150 self._extend = bool(extend) 151 152 self._initialized = False 153 self._reps = {} 154 self._clusters = {} 155 156 if init: 157 self.init()
158 159 @property
160 - def target(self):
161 return self._target
162 163 @property
164 - def threshold(self):
165 return self._threshold
166 167 @property
168 - def extend(self):
169 return self._extend
170
171 - def init(self):
172 """ 173 Compute and cache all L{FragmentCluster}s. 174 """ 175 176 self._reps = {} 177 self._clusters = {} 178 179 for residue in self.target.residues: 180 cluster = self._filter(residue) 181 182 if cluster is not None: 183 rep = cluster.centroid() 184 if rep.has_alternative: 185 rep.exchange() 186 187 self._reps[residue.native.rank] = rep 188 self._clusters[residue.native.rank] = cluster.items 189 190 self._initialized = True
191
192 - def _filter(self, residue):
193 194 try: 195 nodes = [] 196 for ai in residue.assignments: 197 node = ClusterNode.create(ai.fragment) 198 nodes.append(node) 199 200 cluster = FragmentCluster(nodes, threshold=self.threshold) 201 cluster.shrink(minitems=0) 202 203 return cluster 204 205 except (ClusterExhaustedError, ClusterDivergingError): 206 return None
207
208 - def _residue(self, rank):
209 210 for r in self._target.residues: 211 if r.native.rank == rank: 212 return r 213 214 raise ValueError('Rank {0} is out of range'.format(rank))
215
216 - def compute_single(self, rank):
217 """ 218 Extract torsion angles from the L{ClusterRep} at residue C{#rank}. 219 220 @param rank: target residue rank 221 @type rank: int 222 223 @rtype: L{TorsionPredictionInfo} 224 """ 225 226 residue = self._residue(rank) 227 rep = residue.filter(threshold=self.threshold, extend=self.extend) 228 229 if rep is None: 230 return None 231 232 else: 233 fragment = rep.centroid 234 torsion = fragment.torsion_at(rank, rank)[0] 235 ss = fragment.sec_structure_at(rank, rank)[0] 236 237 return TorsionPredictionInfo(rank, rep.confidence, torsion, ss, primary=True)
238
239 - def compute(self, rank):
240 """ 241 Extract torsion angles from all L{ClusterRep}s, covering residue C{#rank}. 242 243 @param rank: target residue rank 244 @type rank: int 245 246 @return: L{TorsionPredictionInfo} instances, sorted by confidence 247 @rtype: tuple of L{TorsionPredictionInfo} 248 """ 249 250 if not self._initialized: 251 self.init() 252 253 prediction = [] 254 255 for rep in self._reps.values(): 256 257 if rep.centroid.qstart <= rank <= rep.centroid.qend: 258 259 fragment = rep.centroid 260 torsion = fragment.torsion_at(rank, rank)[0] 261 ss = fragment.sec_structure_at(rank, rank)[0] 262 info = TorsionPredictionInfo(rank, rep.confidence, torsion, ss) 263 264 if rep is self._reps.get(rank, None): 265 info.primary = True 266 267 prediction.append(info) 268 269 prediction.sort(reverse=True) 270 return tuple(prediction)
271
272 - def flat_torsion_map(self):
273 """ 274 Filter the current fragment map and create a new, completely flat, 275 non-overlapping map built from centroids, assigned iteratively by 276 decreasing confidence. Centroids with lower confidence which overlap 277 with previously assigned centroids will be trimmed to fill existing 278 gaps only. 279 280 @return: L{TorsionPredictionInfo} instances, one for each target residue 281 @rtype: tuple of L{TorsionPredictionInfo} 282 """ 283 284 if not self._initialized: 285 self.init() 286 287 prediction = [] 288 slots = set(range(1, self.target.length + 1)) 289 290 reps = list(self._reps.values()) 291 reps.sort(key=lambda i: i.confidence, reverse=True) 292 293 for rep in reps: 294 295 for rank in range(rep.centroid.qstart, rep.centroid.qend + 1): 296 if rank in slots: 297 torsion = rep.centroid.torsion_at(rank, rank)[0] 298 ss = rep.centroid.sec_structure_at(rank, rank)[0] 299 info = TorsionPredictionInfo(rank, rep.confidence, torsion, ss, primary=True) 300 301 prediction.append(info) 302 slots.remove(rank) 303 304 for rank in slots: 305 prediction.append(TorsionPredictionInfo(rank, 0, None)) 306 307 prediction.sort(key=lambda i: i.rank) 308 return tuple(prediction)
309
310 - def get_angles(self, rank):
311 """ 312 Extract all torsion angles coming from all fragments, which had survived 313 the filtering and cover residue C{#rank}. 314 315 @param rank: target residue rank 316 @type rank: int 317 318 @return: all L{TorsionAngles} for a cluster at the specified residue 319 @rtype: tuple of L{TorsionAngles} 320 """ 321 322 if not self._initialized: 323 self.init() 324 if rank not in self._clusters: 325 return tuple() 326 327 angles = [] 328 329 for node in self._clusters[rank]: 330 fragment = node.fragment 331 torsion = fragment.torsion_at(rank, rank)[0] 332 angles.append(torsion) 333 334 return tuple(angles)
335
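def _example_torsion_prediction(target):
    # Illustrative sketch of the predictor API defined above. ``target`` is
    # assumed to be a Target which already carries fragment assignments, and
    # residue rank 10 is an arbitrary example position.
    predictor = TorsionAnglesPredictor(target, threshold=1.5)
    for info in predictor.compute(10):
        print(info.rank, info.confidence, info.torsion.phi, info.torsion.psi)
    # flat, non-overlapping torsion map over the whole target:
    return predictor.flat_torsion_map()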
class TorsionPredictionInfo(object):
    """
    Struct container for a single torsion angle prediction.
    
    @param rank: target residue rank
    @type rank: int
    @param confidence: confidence of prediction
    @type confidence: float
    @param torsion: assigned phi/psi/omega angles
    @type torsion: L{TorsionAngles}
    @param dssp: assigned secondary structure
    @type dssp: L{SecondaryStructureElement}
    @param primary: if True, designates that the assigned angles are extracted
                    from the L{ClusterRep} at residue C{#rank}; otherwise the
                    angles come from another, overlapping L{ClusterRep}
    @type primary: bool
    """
355 - def __init__(self, rank, confidence, torsion, dssp, primary=False):
356 357 self.rank = rank 358 self.confidence = confidence 359 self.torsion = torsion 360 self.primary = primary 361 self.dssp = dssp
362
363 - def as_tuple(self):
364 """ 365 @return: convert this prediction to a tuple: (confidence, phi, psi, omega) 366 @rtype: tuple 367 """ 368 return tuple([self.confidence, self.torsion.phi, self.torsion.psi, self.torsion.omega])
369
370 - def __str__(self):
371 return '<TorsionPredictionInfo: {0.confidence:6.3f} at #{0.rank}>'.format(self)
372
373 - def __lt__(self, other):
374 return self.confidence < other.confidence
375
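def _example_prediction_table(predictions):
    # Illustrative sketch: TorsionPredictionInfo objects order themselves by
    # confidence (via __lt__), so a collection can be sorted directly and
    # dumped as (confidence, phi, psi, omega) rows. ``predictions`` is assumed
    # to be an iterable of TorsionPredictionInfo with torsion angles attached.
    for info in sorted(predictions, reverse=True):
        print('{0}\t{1}'.format(info.rank, info.as_tuple()))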
376 377 -class AssignmentFactory(object):
378
379 - def target(self, *a, **k):
380 return Target(*a, **k)
381
382 - def residue(self, *a, **k):
383 return TargetResidue(*a, **k)
384
385 - def assignment(self, *a, **k):
386 return Assignment(*a, **k)
387
class Target(csb.core.AbstractNIContainer):
    """
    Represents a protein structure prediction target.
    
    @param id: target sequence ID, in PDB accnC format
    @type id: str
    @param length: total target sequence length
    @type length: int
    @param residues: a list containing the target's residues. See also
                     L{Target.from_sequence}
    @type residues: iterable of L{csb.bio.structure.ProteinResidue}s
    """
401 - def __init__(self, id, length, residues, overlap=None, segments=None, factory=AssignmentFactory()):
402 403 self._id = id 404 self._accession = id[:-1] 405 self._chain_id = id[-1] 406 self._length = length 407 self._overlap = overlap 408 self._factory = factory 409 410 self._assignments = csb.core.ReadOnlyCollectionContainer(type=Assignment) 411 self._errors = csb.core.CollectionContainer() 412 413 resi = [factory.residue(native) for native in residues] 414 self._residues = csb.core.ReadOnlyCollectionContainer(items=resi, 415 type=TargetResidue, start_index=1) 416 417 if segments is not None: 418 segments = dict([(s.start, s) for s in segments]) 419 self._segments = csb.core.ReadOnlyDictionaryContainer(items=segments)
420 421 @staticmethod
422 - def from_sequence(id, sequence):
423 """ 424 Factory, which builds L{Target} objects from a bare sequence. 425 426 @param sequence: target's sequence 427 @type sequence: L{csb.bio.sequence.AbstractSequence}, str or iterable 428 429 @rtype: L{Target} 430 """ 431 432 if isinstance(sequence, csb.bio.sequence.Sequence): 433 sequence = sequence.sequence 434 435 residues = [] 436 437 for rn, aa in enumerate(sequence, start=1): 438 residue = csb.bio.structure.ProteinResidue(rank=rn, type=aa) 439 residues.append(residue) 440 441 return Target(id, len(residues), residues)
442 443 @staticmethod
444 - def from_profile(hmm):
445 """ 446 Factory, which builds L{Target} objects from an HMM profile. 447 448 @param hmm: target's HMM 449 @type hmm: L{csb.bio.hmm.ProfileHMM} 450 451 @rtype: L{Target} 452 """ 453 454 residues = [ r.clone() for r in hmm.residues ] 455 return Target(hmm.id, hmm.layers.length, residues)
456 457 @staticmethod
458 - def deserialize(pickle):
459 460 with open(pickle) as stream: 461 return csb.io.Pickle.load(stream)
462 463 @property
464 - def _children(self):
465 return self._residues
466 467 @property
468 - def errors(self):
469 return self._errors
470 471 @property
472 - def id(self):
473 return self._id
474 475 @property
476 - def accession(self):
477 return self._accession
478 479 @property
480 - def chain_id(self):
481 return self._chain_id
482 483 @property
484 - def max_overlap(self):
485 return self._overlap
486 487 @property
488 - def length(self):
489 return self._length
490 491 @property
492 - def sequence(self):
493 return ''.join(str(r.native.type) for r in self)
494 495 @property
496 - def matches(self):
497 return self._assignments
498 499 @property
500 - def residues(self):
501 return self._residues
502 503 @property
504 - def segments(self):
505 return self._segments
506
507 - def assign(self, fragment):
508 """ 509 Add a new fragment match. 510 @param fragment: fragment to assign 511 @type fragment: L{Assignment} 512 """ 513 514 if not 1 <= fragment.qstart <= fragment.qend <= len(self._residues): 515 raise ValueError("Fragment out of range") 516 517 self._assignments._append_item(fragment) 518 519 for rank in range(fragment.qstart, fragment.qend + 1): 520 ai = ResidueAssignmentInfo(fragment, rank) 521 self._residues[rank].assign(ai) 522 523 if fragment.segment is not None: 524 try: 525 self._segments[fragment.segment].assign(fragment) 526 except KeyError: 527 raise ValueError("Undefined segment starting at {0}".format(fragment.segment))
528
529 - def assignall(self, fragments):
530 """ 531 Assign a bunch of fragments at once. 532 @type fragments: iterable of L{Assignment}s 533 """ 534 for frag in fragments: 535 self.assign(frag)
536
537 - def filter(self, threshold=1.5, extend=False):
538 """ 539 Filter the current fragment map using a L{FragmentCluster}. 540 541 @param threshold: cluster RMSD threshold (see L{FragmentCluster}) 542 @type threshold: float 543 @param extend: pick extended alternatives where possible (default=False) 544 @type extend: bool 545 546 @return: a new target, containing only cluster centroids/reps 547 @rtype: L{Target} 548 """ 549 550 target = self.clone() 551 552 for residue in self.residues: 553 rep = residue.filter(threshold=threshold, extend=extend) 554 555 if rep is not None: 556 target.assign(rep.centroid) 557 558 return target
559
560 - def clone(self):
561 """ 562 @return: a deep copy of the target 563 @rtype: L{Target} 564 """ 565 566 segments = [self.segments[start] for start in self.segments] 567 segments = [TargetSegment(s.start, s.end, s.count) for s in segments] 568 569 target = self._factory.target(self.id, self.length, [r.native for r in self.residues], 570 overlap=self._overlap, segments=segments) 571 572 return target
573
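def _example_filtered_target(hmm, fragments):
    # Illustrative sketch of the Target API above: build a Target from a
    # profile HMM, assign a batch of fragments and keep only the cluster
    # representatives. ``hmm`` (a csb.bio.hmm.ProfileHMM) and ``fragments``
    # (Assignment instances covering the target) are assumptions made for
    # this example.
    target = Target.from_profile(hmm)
    target.assignall(fragments)
    filtered = target.filter(threshold=1.5, extend=True)
    print(filtered.matches.length, 'representative fragments')
    return filtered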
class TargetResidue(object):
    """
    Wrapper around L{Target}'s native residues. Decorates them with additional,
    fragment-related methods.
    
    @type native_residue: L{csb.bio.structure.ProteinResidue}
    """
583 - def __init__(self, native_residue):
584 585 self._type = native_residue.type 586 self._native = native_residue.clone() 587 self._assignments = csb.core.ReadOnlyCollectionContainer(type=ResidueAssignmentInfo)
588 589 @property
590 - def type(self):
591 return self._type
592 593 @property
594 - def native(self):
595 return self._native
596 597 @property
598 - def assignments(self):
599 return self._assignments
600
601 - def assign(self, assignment_info):
602 self._assignments._append_item(assignment_info)
603
604 - def verybest(self):
605 """ 606 @return: the fragment with the lowest RMSD at this position in the L{Target} 607 @rtype: L{Assignment} 608 """ 609 610 best = None 611 612 for ai in self.assignments: 613 a = ai.fragment 614 if a.length < FragmentCluster.MIN_LENGTH: 615 continue 616 if best is None or a.rmsd < best.rmsd: 617 best = a 618 elif a.rmsd == best.rmsd and a.length > best.length: 619 best = a 620 621 return best
622
623 - def filter(self, method=Metrics.RMSD, threshold=1.5, extend=False):
624 """ 625 Filter all fragments, covering this position in the L{Target} using a 626 L{FragmentCluster}. 627 628 @param method: one of the L{Metrics} members (default=L{Metrics.RMSD}) 629 @type method: str 630 @param threshold: cluster RMSD threshold (see L{FragmentCluster}) 631 @type threshold: float 632 @param extend: pick extended alternative where possible (default=False) 633 @type extend: bool 634 635 @return: cluster's representative (if converged) or None 636 @rtype: L{ClusterRep} or None 637 """ 638 639 try: 640 nodes = [] 641 for ai in self.assignments: 642 node = ClusterNode.create(ai.fragment, method, extend) 643 nodes.append(node) 644 645 cluster = FragmentCluster(nodes, threshold=threshold) 646 647 center = cluster.shrink(minitems=0) 648 if center.has_alternative: 649 center.exchange() 650 651 return center 652 653 except (ClusterExhaustedError, ClusterDivergingError): 654 return None
655
656 - def longest(self):
657 """ 658 @return: the longest fragment, covering the current position 659 @rtype: L{Assignment} 660 """ 661 best = None 662 663 for q in self.assignments: 664 if best is None or (q.fragment.length > best.length): 665 best = q.fragment 666 667 return best
668
669 - def precision(self, threshold=1.5):
670 """ 671 @return: the residue-wise precision of the fragment library at the 672 current position (percentage). 673 674 @param threshold: true-positive RMSD cutoff (default=1.5) 675 @type threshold: float 676 @rtype: float 677 """ 678 679 if self.assignments.length < 1: 680 return None 681 else: 682 positive = [a for a in self.assignments if a.fragment.rmsd <= threshold] 683 pos = len(positive) * 100.0 / self.assignments.length 684 685 return pos
686
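def _example_residue_report(target):
    # Illustrative sketch of the TargetResidue helpers above: for each residue
    # of an assigned Target, report the best fragment by RMSD and the
    # residue-wise precision of the local fragment population at a 1.5 A
    # true-positive cutoff. ``target`` is an assumed, already populated Target.
    for residue in target.residues:
        best = residue.verybest()
        if best is not None:
            print(residue.native.rank, best.id, best.rmsd,
                  residue.precision(threshold=1.5))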
687 -class TargetSegment(object):
688
689 - def __init__(self, start, end, count):
690 691 self._start = start 692 self._end = end 693 self._count = count 694 695 self._assignments = csb.core.ReadOnlyCollectionContainer(type=Assignment)
696 697 @property
698 - def count(self):
699 return self._count
700 701 @property
702 - def start(self):
703 return self._start
704 705 @property
706 - def end(self):
707 return self._end
708 709 @property
710 - def length(self):
711 return (self._end - self._start + 1)
712 713 @property
714 - def assignments(self):
715 return self._assignments
716
717 - def assign(self, fragment):
718 if fragment.segment != self.start: 719 raise ValueError('Segment origin mismatch: {0} vs {1}'.format(fragment.segment, self.start)) 720 else: 721 self._assignments._append_item(fragment)
722
723 - def verybest(self):
724 725 best = None 726 727 for a in self.assignments: 728 if a.length < FragmentCluster.MIN_LENGTH: 729 continue 730 if best is None or a.rmsd < best.rmsd: 731 best = a 732 elif a.rmsd == best.rmsd and a.length > best.length: 733 best = a 734 735 return best
736
737 - def best(self, method=Metrics.RMSD):
738 739 try: 740 cluster = FragmentCluster(self.assignments, threshold=1.5, 741 connectedness=0.5, method=method) 742 centroid = cluster.shrink(minitems=1) 743 return centroid 744 745 except ClusterExhaustedError: 746 return None 747 finally: 748 del cluster
749
750 - def longest(self):
751 752 best = None 753 754 for q in self.assignments: 755 if best is None or (q.length > best.length): 756 best = q 757 758 return best
759
760 - def pairwise_rmsd(self, min_overlap=5):
761 762 rmsds = [] 763 764 for q in self.assignments: 765 for s in self.assignments: 766 if q is not s: 767 r = q.rmsd_to(s, min_overlap) 768 if r is not None: 769 rmsds.append(r) 770 else: 771 assert q.rmsd_to(s, 1) < 0.01 772 773 return rmsds
774
775 - def pairwise_mda(self, min_overlap=5):
776 777 mdas = [] 778 779 for q in self.assignments: 780 for s in self.assignments: 781 if q is not s: 782 m = q.mda_to(s, min_overlap) 783 if m is not None: 784 mdas.append(m) 785 return mdas
786
787 - def pairwise_sa_rmsd(self, profiles='.', min_overlap=5):
788 789 from csb.bio.hmm import RELATIVE_SA 790 from csb.bio.io.hhpred import ScoreUnits, HHProfileParser 791 792 def convert_sa(sa): 793 return numpy.array([ RELATIVE_SA[i] for i in sa ])
794 795 sources = {} 796 scores = [] 797 798 for q in self.assignments: 799 for s in self.assignments: 800 801 if s.source_id not in sources: 802 hmm = HHProfileParser(os.path.join(profiles, s.source_id + '.hhm')).parse() 803 sources[s.source_id] = hmm.dssp_solvent 804 805 if q is not s: 806 807 common = q.overlap(s) 808 if len(common) >= min_overlap: 809 810 qsa = q.solvent_at(sources[q.source_id], min(common), max(common)) 811 ssa = s.solvent_at(sources[s.source_id], min(common), max(common)) 812 813 if '-' in qsa + ssa: 814 continue 815 816 qsa = convert_sa(qsa) 817 ssa = convert_sa(ssa) 818 assert len(qsa) == len(ssa) 819 sa_rmsd = numpy.sqrt(numpy.sum((qsa - ssa) ** 2) / float(len(qsa))) 820 821 scores.append(sa_rmsd) 822 return scores
823
824 - def pairwise_scores(self, profiles='.', min_overlap=5):
825 826 from csb.bio.hmm import BACKGROUND 827 back = numpy.sqrt(numpy.array(BACKGROUND)) 828 829 sources = {} 830 scores = [] 831 832 for q in self.assignments: 833 for s in self.assignments: 834 835 if s.source_id not in sources: 836 # hmm = HHProfileParser(os.path.join(hmm_path, s.source_id + '.hhm')).parse(ScoreUnits.Probability) 837 sources[s.source_id] = csb.io.Pickle.load(open(os.path.join(profiles, s.source_id + '.pkl'), 'rb')) 838 839 if q is not s: 840 841 common = q.overlap(s) 842 if len(common) >= min_overlap: 843 844 qprof = q.profile_at(sources[q.source_id], min(common), max(common)) 845 sprof = s.profile_at(sources[s.source_id], min(common), max(common)) 846 847 #score = qhmm.emission_similarity(shmm) 848 assert len(qprof) == len(sprof) 849 dots = [ numpy.dot(qprof[i] / back, sprof[i] / back) for i in range(len(qprof)) ] 850 score = numpy.log(numpy.prod(dots)) 851 if score is not None: 852 scores.append(score) 853 return scores
854
855 - def _entropy(self, data, binsize):
856 857 binsize = float(binsize) 858 bins = numpy.ceil(numpy.array(data) / binsize) 859 860 hist = dict.fromkeys(bins, 0) 861 for bin in bins: 862 hist[bin] += (1.0 / len(bins)) 863 864 freq = numpy.array(hist.values()) 865 return - numpy.sum(freq * numpy.log(freq))
866
867 - def rmsd_entropy(self, binsize=0.1):
868 869 rmsds = self.pairwise_rmsd() 870 return self._entropy(rmsds, binsize)
871
872 - def score_entropy(self, profiles='.', binsize=1):
873 874 scores = self.pairwise_scores(profiles) 875 return self._entropy(scores, binsize)
876
877 - def rmsd_consistency(self, threshold=1.5):
878 879 rmsds = self.pairwise_rmsd() 880 881 if len(rmsds) < 1: 882 return None 883 884 return sum([1 for i in rmsds if i <= threshold]) / float(len(rmsds))
885
886 - def sa_rmsd_consistency(self, threshold=0.4, profiles='.'):
887 888 sa_rmsds = self.pairwise_sa_rmsd(profiles=profiles) 889 890 if len(sa_rmsds) < 1: 891 return None 892 893 return sum([1 for i in sa_rmsds if i <= threshold]) / float(len(sa_rmsds))
894
895 - def true_positives(self, threshold=1.5):
896 897 if self.assignments.length < 1: 898 return None 899 900 return sum([1 for i in self.assignments if i.rmsd <= threshold]) / float(self.assignments.length)
901
902 - def confidence(self):
903 904 cons = self.rmsd_consistency() 905 906 if cons is None: 907 return 0 908 else: 909 return numpy.log10(self.count) * cons
910
911 -class ResidueAssignmentInfo(object):
912
    def __init__(self, assignment, rank):
        
        if not assignment.qstart <= rank <= assignment.qend:
            raise ValueError('Rank {0} is not matched by this assignment'.format(rank))
        
        self._assignment = assignment
        self._rank = rank
        self._relrank = rank - assignment.qstart
921 922 @property
923 - def c_alpha(self):
924 return self._assignment.backbone[self._relrank]
925 926 @property
927 - def fragment(self):
928 return self._assignment
929
class Assignment(FragmentMatch):
    """
    Represents a match between a fragment and its target.
    
    @param source: source structure (must have torsion angles precomputed)
    @type source: L{csb.bio.structure.Chain}
    @param start: start position in C{source} (rank)
    @type start: int
    @param end: end position in C{source} (rank)
    @type end: int
    @param id: fragment ID
    @type id: str
    @param qstart: start position in target (rank)
    @type qstart: int
    @param qend: end position in target (rank)
    @type qend: int
    @param probability: probability of assignment
    @type probability: float
    @param rmsd: RMSD of the fragment, compared to target's native structure
    @type rmsd: float
    """
952 - def __init__(self, source, start, end, id, qstart, qend, probability, rmsd, tm_score=None, 953 score=None, neff=None, segment=None, internal_id=None):
954 955 assert source.has_torsion 956 sub = source.subregion(start, end, clone=True) 957 try: 958 calpha = [r.atoms['CA'].vector.copy() for r in sub.residues] 959 except csb.core.ItemNotFoundError: 960 raise csb.bio.structure.Broken3DStructureError() 961 torsion = [r.torsion.copy() for r in sub.residues] 962 963 self._calpha = csb.core.ReadOnlyCollectionContainer(items=calpha, type=numpy.ndarray) 964 self._torsion = torsion 965 self._sequence = sub.sequence 966 967 self._source_id = source.accession[:4] + source.id 968 self._start = start 969 self._end = end 970 971 self._score = score 972 self._neff = neff 973 self._ss = None 974 975 self._segment_start = segment 976 self.internal_id = internal_id 977 978 super(Assignment, self).__init__(id, qstart, qend, probability, rmsd, tm_score, None) 979 980 self._ss = SecondaryStructure('-' * self.length)
981 982 @staticmethod
983 - def from_fragment(fragment, provider):
984 """ 985 Create a new L{Assignment} given a source rosetta fragment. 986 987 @param fragment: rosetta fragment 988 @type fragment: L{RosettaFragment} 989 @param provider: PDB database provider 990 @type provider: L{StructureProvider} 991 992 @rtype: L{Assignment} 993 """ 994 structure = provider.get(fragment.accession) 995 source = structure.chains[fragment.chain] 996 source.compute_torsion() 997 998 id = "{0}:{1}-{2}".format(fragment.source_id, fragment.start, fragment.end) 999 1000 return Assignment(source, fragment.start, fragment.end, id, 1001 fragment.qstart, fragment.qend, 0, 0)
1002 1003 @property
1004 - def backbone(self):
1005 return self._calpha
1006 1007 @property
1008 - def sequence(self):
1009 return self._sequence
1010 1011 @property
1012 - def torsion(self):
1013 return self._torsion
1014 1015 @property
1016 - def source_id(self):
1017 return self._source_id
1018 1019 @property
1020 - def start(self):
1021 return self._start
1022 1023 @property
1024 - def end(self):
1025 return self._end
1026 1027 @property
1028 - def score(self):
1029 return self._score
1030 1031 @property
1032 - def neff(self):
1033 return self._neff
1034 1035 @property
1036 - def segment(self):
1037 return self._segment_start
1038 1039 @property
1040 - def secondary_structure(self):
1041 return self._ss
1042 @secondary_structure.setter
1043 - def secondary_structure(self, value):
1044 1045 if isinstance(value, csb.core.string): 1046 value = csb.bio.structure.SecondaryStructure(value) 1047 if len(str(value)) != self.length:#(value.end - value.start + 1) != self.length: 1048 raise ValueError("Invalid secondary structure length", len(str(value)), self.length ) 1049 1050 self._ss = value
1051
1052 - def transform(self, rotation, translation):
1053 """ 1054 Apply rotation/translation to fragment's coordinates in place. 1055 """ 1056 1057 for ca in self.backbone: 1058 newca = numpy.dot(ca, numpy.transpose(rotation)) + translation 1059 for i in range(3): 1060 ca[i] = newca[i]
1061
1062 - def _check_range(self, qstart, qend):
1063 1064 if not (self.qstart <= qstart <= qend <= self.qend): 1065 raise ValueError('Region {0}..{1} is out of range {2.qstart}..{2.qend}'.format(qstart, qend, self))
1066
1067 - def anchored_around(self, rank):
1068 """ 1069 @return: True if the fragment is centered around position=C{rank}. 1070 @rtype: bool 1071 """ 1072 1073 if self.qstart < rank < self.qend: 1074 if (rank - self.qstart + 1) > 0.4 * (self.qend - self.qstart + 1): 1075 return True 1076 1077 return False
1078
1079 - def backbone_at(self, qstart, qend):
1080 """ 1081 @return: the CA coordinates of the fragment at the specified subregion. 1082 @rtype: list 1083 """ 1084 1085 self._check_range(qstart, qend) 1086 1087 relstart = qstart - self.qstart 1088 relend = qend - self.qstart + 1 1089 1090 return self.backbone[relstart : relend]
1091
1092 - def torsion_at(self, qstart, qend):
1093 """ 1094 @return: the torsion angles of the fragment at the specified subregion. 1095 @rtype: list 1096 """ 1097 1098 self._check_range(qstart, qend) 1099 1100 relstart = qstart - self.qstart 1101 relend = qend - self.qstart + 1 1102 1103 return self.torsion[relstart : relend]
1104
1105 - def solvent_at(self, sa_string, qstart, qend):
1106 1107 self._check_range(qstart, qend) 1108 1109 relstart = qstart - self.qstart 1110 relend = qend - self.qstart + 1 1111 1112 return sa_string[relstart : relend]
1113
1114 - def sec_structure_at(self, qstart, qend):
1115 1116 self._check_range(qstart, qend) 1117 start = qstart - self.qstart + 1 1118 end = qend - self.qstart + 1 1119 1120 return self.secondary_structure.scan(start, end, loose=True, cut=True)
1121
1122 - def profile_at(self, source, qstart, qend):
1123 1124 self._check_range(qstart, qend) 1125 1126 start = qstart - self.qstart + self.start 1127 end = qend - self.qstart + self.start 1128 1129 if hasattr(source, 'subregion'): 1130 return source.subregion(start, end) 1131 else: 1132 return source[start - 1 : end]
1133
1134 - def chain_at(self, source, qstart, qend):
1135 1136 self._check_range(qstart, qend) 1137 1138 start = qstart - self.qstart + self.start 1139 end = qend - self.qstart + self.start 1140 1141 return source.subregion(start, end)
1142
1143 - def overlap(self, other):
1144 """ 1145 @type other: L{Assignment} 1146 @return: target positions, covered by both C{self} and C{other} 1147 @rtype: set of int 1148 """ 1149 1150 qranks = set(range(self.qstart, self.qend + 1)) 1151 sranks = set(range(other.qstart, other.qend + 1)) 1152 1153 return qranks.intersection(sranks)
1154
1155 - def rmsd_to(self, other, min_overlap=5):
1156 """ 1157 @return: the CA RMSD between C{self} and C{other}. 1158 1159 @param other: another fragment 1160 @type other: L{Assignment} 1161 @param min_overlap: require at least that number of overlapping residues 1162 (return None if not satisfied) 1163 @type min_overlap: int 1164 1165 @rtype: float 1166 """ 1167 1168 common = self.overlap(other) 1169 1170 if len(common) >= min_overlap: 1171 1172 qstart, qend = min(common), max(common) 1173 1174 q = self.backbone_at(qstart, qend) 1175 s = other.backbone_at(qstart, qend) 1176 1177 if len(q) > 0 and len(s) > 0: 1178 return csb.bio.utils.rmsd(numpy.array(q), numpy.array(s)) 1179 1180 return None
1181
1182 - def nrmsd_to(self, other, min_overlap=5):
1183 1184 common = self.overlap(other) 1185 1186 if len(common) >= min_overlap: 1187 1188 qstart, qend = min(common), max(common) 1189 1190 q = self.backbone_at(qstart, qend) 1191 s = other.backbone_at(qstart, qend) 1192 1193 if len(q) > 0 and len(s) > 0: 1194 return csb.bio.utils.rmsd(q, s) / RANDOM_RMSD[ len(common) ] 1195 1196 return None
1197
1198 - def mda_to(self, other, min_overlap=5):
1199 1200 common = self.overlap(other) 1201 1202 if len(common) >= min_overlap: 1203 1204 qstart, qend = min(common), max(common) 1205 1206 q = self.torsion_at(qstart, qend) 1207 s = other.torsion_at(qstart, qend) 1208 1209 if len(q) > 0 and len(s) > 0: 1210 1211 maxphi = max(numpy.abs(i.phi - j.phi) for i, j in zip(q, s)[1:]) # phi: 2 .. L 1212 maxpsi = max(numpy.abs(i.psi - j.psi) for i, j in zip(q, s)[:-1]) # psi: 1 .. L-1 1213 1214 return max(maxphi, maxpsi) 1215 1216 return None
1217
1218 - def to_rosetta(self, source, qstart=None, qend=None, weight=None):
1219 """ 1220 @deprecated: this method will be deleted soon. Use 1221 L{csb.bio.fragments.rosetta.OutputBuilder} instead. 1222 """ 1223 stream = csb.io.MemoryStream() 1224 1225 if weight is None: 1226 weight = self.probability 1227 if not qstart: 1228 qstart = self.qstart 1229 if not qend: 1230 qend = self.qend 1231 1232 source.compute_torsion() 1233 chain = self.chain_at(source, qstart, qend) 1234 1235 for i, r in enumerate(chain.residues): 1236 1237 acc = self.source_id[:4] 1238 ch = self.source_id[4].upper() 1239 1240 start = qstart - self.qstart + self.start + i 1241 aa = r.type 1242 ss = 'L' 1243 phi, psi, omega = 0, 0, 0 1244 if r.torsion.phi: 1245 phi = r.torsion.phi 1246 if r.torsion.psi: 1247 psi = r.torsion.psi 1248 if r.torsion.omega: 1249 omega = r.torsion.omega 1250 1251 stream.write(' {0:4} {1:1} {2:>5} {3!s:1} {4!s:1} {5:>8.3f} {6:>8.3f} {7:>8.3f} {8:>8.3f}\n'.format(acc, ch, start, aa, ss, phi, psi, omega, weight)) 1252 1253 return stream.getvalue()
1254
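def _example_compare_fragments(a, b):
    # Illustrative sketch: compare two overlapping Assignment objects using the
    # methods defined above. rmsd_to() returns None unless the fragments share
    # at least min_overlap target positions; nrmsd_to() additionally normalizes
    # by the expected RMSD of random fragments of that length (the module-level
    # RANDOM_RMSD table). ``a`` and ``b`` are assumed Assignment instances.
    common = a.overlap(b)
    if len(common) >= 5:
        print('overlap:', min(common), '-', max(common))
        print('CA RMSD:', a.rmsd_to(b, min_overlap=5))
        print('normalized RMSD:', a.nrmsd_to(b, min_overlap=5))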
1255 -class ClusterExhaustedError(ValueError):
1256 pass
1257
1258 -class ClusterEmptyError(ClusterExhaustedError):
1259 pass
1260
1261 -class ClusterDivergingError(RuntimeError):
1262 pass
1263
class FragmentCluster(object):
    """
    Provides clustering/filtering of the fragments covering a common residue
    in the target. Clustering is done via iterative shrinking of the cluster.
    At each iteration, node rejection (deletion) is attempted for each node.
    The node rejection causing the most significant drop in the average
    pairwise distance (RMSD) in the cluster is retained. This procedure is
    repeated until: 1) the average pairwise RMSD drops below the C{threshold}
    (converged), 2) the cluster gets exhausted or 3) node rejection no longer
    causes a drop in the average distance (not converging).
    
    @param items: cluster members
    @type items: iterable of L{ClusterNode}s
    @param threshold: RMSD threshold; continue shrinking until the mean distance
                      drops below this value (default=1.5)
    @type threshold: float
    @param connectedness: use only nodes which are connected to at least c% of all
                          initial nodes (default=0.5, meaning 50%)
    @type connectedness: float
    """
    
    MIN_LENGTH = 6
1287 - def __init__(self, items, threshold=1.5, connectedness=0.5):
1288 1289 items = set(i for i in items if i.fragment.length >= FragmentCluster.MIN_LENGTH) 1290 1291 self._matrix = {} 1292 self._threshold = float(threshold) 1293 self._connectedness = float(connectedness) 1294 1295 for i in items: 1296 1297 self._matrix[i] = {} 1298 conn = 0.0 1299 1300 for j in items: 1301 distance = i.distance(j) 1302 if distance is not None: 1303 conn += 1 1304 self._matrix[i][j] = distance 1305 1306 if conn / len(items) < self.connectedness: 1307 # reject i as a first class node 1308 del self._matrix[i] 1309 1310 self._items = set(self._matrix.keys()) 1311 1312 if len(self._items) < 1: 1313 raise ClusterEmptyError() 1314 1315 self._initcount = self.count
1316 1317 @property
1318 - def count(self):
1319 return len(self._items)
1320 1321 @property
1322 - def items(self):
1323 return tuple(self._items)
1324 1325 @property
1326 - def fragments(self):
1327 return tuple(i.fragment for i in self._items)
1328 1329 @property
1330 - def threshold(self):
1331 return self._threshold
1332 @threshold.setter
1333 - def threshold(self, value):
1334 self._threshold = float(value)
1335 1336 @property
1337 - def connectedness(self):
1338 return self._connectedness
1339
1340 - def _distances(self, skip=None):
1341 1342 d = [] 1343 1344 for i in self._matrix: 1345 if skip is i: 1346 continue 1347 1348 for j in self._matrix[i]: 1349 if skip is not j: 1350 d.append(self._matrix[i][j]) 1351 1352 return d
1353
1354 - def _distance(self, i, j):
1355 1356 if j in self._matrix[i]: 1357 return self._matrix[i][j] 1358 else: 1359 return None
1360
1361 - def mean(self, skip=None):
1362 """ 1363 @return: the current mean distance in the cluster 1364 @rtype: float 1365 """ 1366 1367 d = self._distances(skip=skip) 1368 1369 if len(d) > 0: 1370 return numpy.mean(d) 1371 else: 1372 raise ClusterExhaustedError()
1373
1374 - def centroid(self):
1375 """ 1376 @return: the current representative fragment 1377 @rtype: L{ClusterRep} 1378 1379 @note: the cluster rep is the node with the lowest average distance 1380 to all other nodes. If a fixed fragment exists, structurally similar 1381 to the rep, but longer, this fragment may be suggested as an alternative 1382 (see also L{ClusterRep}). 1383 """ 1384 1385 alt = None 1386 cen = None 1387 avg = None 1388 1389 for i in self._matrix: 1390 1391 curravg = numpy.mean(list(self._matrix[i].values())) 1392 1393 if avg is None or curravg < avg: 1394 avg = curravg 1395 cen = i 1396 elif curravg == avg: 1397 if i.fragment.length > cen.fragment.length: 1398 cen = i 1399 1400 d = self._distances() 1401 mean = numpy.mean(d) 1402 cons = sum(1.0 for i in d if i <= self.threshold) / len(d) 1403 1404 for i in self._matrix: 1405 if i is not cen and i.fixed and i.fragment.length > cen.fragment.length: 1406 distance = self._distance(i, cen) 1407 if distance is not None and distance < 0.5 * self.threshold: 1408 if alt is None or alt.fragment.length < i.fragment.length: 1409 alt = i 1410 1411 return ClusterRep(cen, mean, cons, len(self._matrix[cen]), alternative=alt, 1412 rejections=(self._initcount - self.count))
1413
1414 - def reject(self, item):
1415 """ 1416 Remove C{item} from the cluster. 1417 1418 @type item: L{ClusterNode} 1419 @raise ClusterExhaustedError: if this is the last remaining item 1420 """ 1421 1422 if self.count == 1: 1423 raise ClusterExhaustedError() 1424 1425 assert not item.fixed 1426 1427 for i in self._matrix: 1428 if item in self._matrix[i]: 1429 del self._matrix[i][item] 1430 1431 del self._matrix[item] 1432 self._items.remove(item)
1433
1434 - def shrinkone(self):
1435 """ 1436 Shrink the cluster by a single node. 1437 1438 @return: True on successful shrink, False otherwise (e.g. if 1439 already converged) 1440 @rtype: bool 1441 @raise ClusterExhaustedError: if exhausted 1442 @raise ClusterDivergingError: if not converging 1443 """ 1444 1445 mean = self.mean() 1446 if mean <= self.threshold or self.count == 1: 1447 return False # already shrunk enough 1448 1449 m = {} 1450 1451 for i in self._matrix: 1452 if not i.fixed: 1453 newmean = self.mean(skip=i) 1454 m[newmean] = i 1455 1456 if len(m) == 0: # only fixed items remaining 1457 raise ClusterExhaustedError() 1458 1459 newmean = min(m) 1460 1461 if newmean > mean: 1462 raise ClusterDivergingError() # can't converge, usually when fixed items are too far away from the average 1463 elif newmean < mean: 1464 junk = m[newmean] 1465 self.reject(junk) 1466 return True # successful shrink 1467 else: 1468 return False # converged
1469
1470 - def shrink(self, minitems=2):
1471 """ 1472 Start automatic shrinking. 1473 1474 @param minitems: absolute minimum of the number of nodes in the cluster 1475 @type minitems: int 1476 1477 @return: cluster's representative: the node with the lowest average 1478 distance to all other nodes in the cluster 1479 @rtype: L{ClusterRep} 1480 1481 @raise ClusterExhaustedError: if C{self.count} < C{minitems} and 1482 still not converged 1483 """ 1484 1485 if self.count > minitems: 1486 1487 while self.shrinkone(): 1488 if self.count <= minitems: 1489 raise ClusterExhaustedError() 1490 else: 1491 raise ClusterExhaustedError() 1492 1493 return self.centroid()
1494
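def _example_cluster(assignments):
    # Illustrative sketch of the clustering machinery above: wrap the fragments
    # covering one target position in ClusterNodes, shrink the cluster and
    # retrieve its representative. ``assignments`` is assumed to be an iterable
    # of Assignment objects which all cover the residue of interest.
    nodes = [ClusterNode.create(a) for a in assignments]
    try:
        cluster = FragmentCluster(nodes, threshold=1.5)
        rep = cluster.shrink(minitems=2)
        print(rep.centroid.id, rep.confidence, rep.rejections)
        return rep
    except (ClusterExhaustedError, ClusterDivergingError):
        return None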
1495 -class ClusterNode(object):
1496 """ 1497 Cluster node. 1498 1499 @param fragment: fragment 1500 @type fragment: L{Assignment} 1501 @param distance: distance metric (a L{Metrics} member, default is RMSD) 1502 @type distance: str 1503 @param fixed: mark this node as fixed (cannot be rejected) 1504 @type fixed: bool 1505 """ 1506 1507 FIXED = 0.7 1508 1509 @staticmethod
1510 - def create(fragment, method=Metrics.RMSD, extend=False):
1511 """ 1512 Create a new L{ClusterNode} given a specified C{Assignment}. If this 1513 assignment is a high probability match, define it as a fixed fragment. 1514 1515 @rtype: L{ClusterNode} 1516 """ 1517 if fragment.probability > ClusterNode.FIXED and fragment.length >= FragmentCluster.MIN_LENGTH: 1518 return ClusterNode(fragment, distance=method, fixed=extend) 1519 else: 1520 return ClusterNode(fragment, distance=method, fixed=False)
1521
1522 - def __init__(self, fragment, distance=Metrics.RMSD, fixed=False):
1523 1524 if fixed and fragment.length < FragmentCluster.MIN_LENGTH: 1525 raise ValueError("Can't fix a short fragment") 1526 1527 self.fragment = fragment 1528 self.fixed = bool(fixed) 1529 1530 self._distance = getattr(self.fragment, distance)
1531
1532 - def distance(self, other):
1533 """ 1534 @return: the distance between self and another node 1535 @type other: L{ClusterNode} 1536 @rtype: float 1537 """ 1538 return self._distance(other.fragment)
1539
class ClusterRep(object):
    """
    Cluster's representative (centroid) node. This object carries the
    result of the shrinking procedure.
    
    @param centroid: rep node
    @type centroid: L{ClusterNode}
    @param mean: current mean distance in the cluster
    @type mean: float
    @param consistency: percentage of pairwise distances below the RMSD C{threshold}
    @type consistency: float
    @param count: current number of nodes in the cluster
    @type count: int
    @param rejections: total number of rejections
    @type rejections: int
    @param alternative: suggested cluster rep alternative (e.g. structurally
                        similar to the centroid, but longer)
    @type alternative: L{ClusterNode}
    """
1560 - def __init__(self, centroid, mean, consistency, count, rejections=0, alternative=None):
1561 1562 if isinstance(centroid, ClusterNode): 1563 centroid = centroid.fragment 1564 if isinstance(alternative, ClusterNode): 1565 alternative = alternative.fragment 1566 1567 self._centroid = centroid 1568 self._alternative = alternative 1569 self._mean = mean 1570 self._consistency = consistency 1571 self._count = count 1572 self._rejections = rejections
1573 1574 @property
1575 - def confidence(self):
1576 """ 1577 Confidence of assignment: log10(count) * consistency 1578 """ 1579 if self.count <= 0 or self.count is None or self.consistency is None: 1580 return 0 1581 else: 1582 return numpy.log10(self.count) * self.consistency
1583 1584 @property
1585 - def centroid(self):
1586 return self._centroid
1587 1588 @property
1589 - def alternative(self):
1590 return self._alternative
1591 1592 @property
1593 - def has_alternative(self):
1594 return self._alternative is not None
1595 1596 @property
1597 - def mean(self):
1598 return self._mean
1599 1600 @property
1601 - def consistency(self):
1602 return self._consistency
1603 1604 @property
1605 - def count(self):
1606 return self._count
1607 1608 @property
1609 - def rejections(self):
1610 return self._rejections
1611
1612 - def exchange(self):
1613 """ 1614 If an alternative is available, swap the centroid and the alternative. 1615 """ 1616 1617 if self._alternative is not None: 1618 1619 centroid = self._centroid 1620 self._centroid = self._alternative 1621 self._alternative = centroid
1622
1623 - def to_rosetta(self, source):
1624 """ 1625 @deprecated: this method is obsolete and will be deleted soon 1626 """ 1627 return self.centroid.to_rosetta(source, weight=self.confidence)
1628
1629 -class AdaptedAssignment(object):
1630 1631 @staticmethod
1632 - def with_overhangs(center, start, end, overhang=1):
1633 1634 if center.centroid.qstart <= (start - overhang): 1635 start -= overhang 1636 elif center.centroid.qstart < start: 1637 start = center.centroid.qstart 1638 1639 if center.centroid.qend >= (end + overhang): 1640 end += overhang 1641 elif center.centroid.qend > end: 1642 end = center.centroid.end 1643 1644 return AdaptedAssignment(center, start, end)
1645
1646 - def __init__(self, center, qstart, qend):
1647 1648 if qstart < center.centroid.qstart: 1649 raise ValueError(qstart) 1650 if qend > center.centroid.qend: 1651 raise ValueError(qend) 1652 1653 self._qstart = qstart 1654 self._qend = qend 1655 self._center = center
1656 1657 @property
1658 - def fragment(self):
1659 return self._center.centroid
1660 1661 @property
1662 - def center(self):
1663 return self._center
1664 1665 @property
1666 - def confidence(self):
1667 return self._center.confidence
1668 1669 @property
1670 - def qstart(self):
1671 return self._qstart
1672 1673 @property
1674 - def qend(self):
1675 return self._qend
1676 1677 @property
1678 - def backbone(self):
1679 return self.fragment.backbone_at(self.qstart, self.qend)
1680
1681 - def chain(self, source):
1682 return self.fragment.chain_at(source, self.qstart, self.qend)
1683
1684 - def to_rosetta(self, source):
1685 return self.fragment.to_rosetta(source, self.qstart, self.qend, self.confidence)
1686
1687 -class SmoothFragmentMap(csb.core.AbstractContainer):
1688
1689 - def __init__(self, length, centroids):
1690 1691 if not length > 0: 1692 raise ValueError(length) 1693 1694 self._length = int(length) 1695 self._slots = set(range(1, self._length + 1)) 1696 self._map = {} 1697 1698 centers = list(centroids) 1699 centers.sort(key=lambda i: i.confidence, reverse=True) 1700 1701 for c in centers: 1702 self.assign(c)
1703 1704 @property
1705 - def _children(self):
1706 return self._map
1707
1708 - def assign(self, center):
1709 1710 for r in range(center.centroid.qstart, center.centroid.qend + 1): 1711 if r in self._slots: 1712 self._map[r] = center 1713 self._slots.remove(r)
1714
1715 - def patches(self):
1716 1717 center = None 1718 start = None 1719 end = None 1720 1721 for r in range(1, self._length + 1): 1722 1723 if center is None: 1724 if r in self._map: 1725 center = self._map[r] 1726 start = end = r 1727 else: 1728 center = None 1729 start = end = None 1730 else: 1731 if r in self._map: 1732 if self._map[r] is center: 1733 end = r 1734 else: 1735 yield AdaptedAssignment(center, start, end) 1736 center = self._map[r] 1737 start = end = r 1738 else: 1739 yield AdaptedAssignment(center, start, end) 1740 center = None 1741 start = end = None
1742
1743 1744 -class ResidueEventInfo(object):
1745
1746 - def __init__(self, rank, confidence=None, count=None, confident=True, rep=None):
1747 1748 self.rank = rank 1749 self.confidence = confidence 1750 self.confident = confident 1751 self.count = count 1752 self.rep = rep
1753
class RosettaFragsetFactory(object):
    """
    Simplifies the construction of fragment libraries.
    """
1759 - def __init__(self):
1760 import csb.bio.fragments.rosetta as rosetta 1761 self.rosetta = rosetta
1762
1763 - def make_fragset(self, target):
1764 """ 1765 Build a fragment library given a L{Target} and its L{Assignment}s. 1766 1767 @param target: target protein 1768 @type target: L{Target} 1769 1770 @rtype: L{RosettaFragmentMap} 1771 """ 1772 1773 frag_factory = self.rosetta.RosettaFragment 1774 fragments = list(map(frag_factory.from_object, target.matches)) 1775 #fragments = [ frag_factory.from_object(f) for f in target.matches if f.length >= 6 ] 1776 fragments.sort() 1777 1778 return self.rosetta.RosettaFragmentMap(fragments, target.length)
1779
1780 - def make_chopped(self, fragments, window):
1781 """ 1782 Build a fixed-length fragment library from a list of 1783 variable-length L{Assignment}s. 1784 1785 @param fragments: source fragments 1786 @type fragments: iterable of L{RosettaFragment}s 1787 @param window: fixed-length fragment size (for classic Rosetta: choose 9) 1788 @type window: int 1789 1790 @return: fixed-length fragment library 1791 @rtype: L{RosettaFragmentMap} 1792 """ 1793 1794 frags = [] 1795 1796 for f in fragments: 1797 for qs in range(f.qstart, f.qend - window + 1): 1798 frags.append(f.subregion(qs, qs + window - 1)) 1799 1800 return self.rosetta.RosettaFragmentMap(frags)
1801
1802 - def make_combined(self, target, filling, threshold=0.5, callback=None):
1803 """ 1804 Complement C{target}'s assignments with C{filling} (e.g. rosetta fragments). 1805 The regions to be complemented are determined by calculating the confidence 1806 at each residue (by filtering). 1807 1808 1809 @param target: target protein 1810 @type target: L{Target} 1811 @param filling: additional fragments to place in the low-conf regions 1812 @type filling: L{RosettaFragmentMap} or iterable of L{RosettaFragment} 1813 @param threshold: confidence threshold 1814 @type threshold: float 1815 1816 @return: complemented fragment library 1817 @rtype: L{RosettaFragmentMap} 1818 """ 1819 1820 fragmap = self.make_fragset(target) 1821 covered = set() 1822 1823 for r in target.residues: 1824 1825 if r.assignments.length == 0: 1826 if callback: 1827 callback(ResidueEventInfo(r.native.rank, None, 0, False)) 1828 continue 1829 1830 cluster = r.filter() 1831 if cluster is None: 1832 if callback: 1833 callback(ResidueEventInfo(r.native.rank, 0, 0, False)) 1834 continue 1835 1836 if cluster.confidence >= threshold: 1837 covered.add(r.native.rank) 1838 elif callback: 1839 callback(ResidueEventInfo(r.native.rank, cluster.confidence, cluster.count, False)) 1840 1841 for r in target.residues: 1842 if r.native.rank not in covered: # true for gaps and low-conf residues 1843 fragmap.mark_unconfident(r.native.rank) 1844 1845 for frag in filling: 1846 fragmap.complement(frag) 1847 1848 return fragmap
1849
    def make_filtered(self, target, extend=False, callback=None):
        """
        Build a filtered fragment library (by clustering), containing only
        representative fragments (cluster centroids).
        
        @param target: target protein
        @type target: L{Target}
        @param extend: if True, pick alternative reps if available
        @type extend: bool
        
        @return: filtered fragment library
        @rtype: L{RosettaFragmentMap}
        """
        
        fragments = []
        
        for r in target.residues:
            if r.assignments.length == 0:
                continue
            
            cluster = r.filter(extend=extend)
            if cluster is None:
                continue
            
            if extend and cluster.has_alternative:
                best = cluster.alternative
            else:
                best = cluster.centroid
            
            fragment = self.rosetta.RosettaFragment.from_object(best)
            fragments.append(fragment)
            if callback:
                callback(ResidueEventInfo(r.native.rank, cluster.confidence, cluster.count, rep=cluster.centroid))
        
        fragments.sort()
        return self.rosetta.RosettaFragmentMap(fragments, target.length)
1886
1887 - def mix(self, *fragsets):
1888 """ 1889 Mix fragments from multiple libraries. 1890 1891 @type fragsets: L{RosettaFragmentMap} 1892 @return: mixed fragment library 1893 @rtype: L{RosettaFragmentMap} 1894 """ 1895 1896 fragments = [] 1897 length = 0 1898 1899 for fragset in fragsets: 1900 if fragset._length > length: 1901 length = fragset._length 1902 1903 for fragment in fragset: 1904 fragments.append(fragment) 1905 1906 return self.rosetta.RosettaFragmentMap(fragments, length)
1907
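def _example_fragment_library(target, rosetta_fragments):
    # Illustrative sketch of the factory API above: build a filtered 9-mer
    # fragment library and complement weak regions with pre-computed Rosetta
    # fragments. ``target`` is an assumed, already assigned Target and
    # ``rosetta_fragments`` is assumed to be an iterable of
    # csb.bio.fragments.rosetta.RosettaFragment objects.
    factory = RosettaFragsetFactory()
    filtered = factory.make_filtered(target, extend=True)
    ninemers = factory.make_chopped(filtered, window=9)
    combined = factory.make_combined(target, rosetta_fragments, threshold=0.5)
    return factory.mix(ninemers, combined)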
1908 1909 -class BenchmarkAdapter(object):
1910
1911 - class Connection(object):
1912 1913 FACTORY = None 1914 DSN = None 1915
1916 - def __init__(self, factory=None, dsn=None):
1917 1918 self.factory = factory or self.__class__.FACTORY 1919 self.cs = dsn or self.__class__.DSN 1920 self.connection = None 1921 self.cursor = None
1922
1923 - def __enter__(self):
1924 1925 self.connection = self.factory(self.cs) 1926 try: 1927 self.cursor = self.connection.cursor() 1928 except: 1929 self.connection.close() 1930 raise 1931 return self
1932
1933 - def __exit__(self, *args):
1934 try: 1935 if not self.cursor.closed: 1936 self.cursor.close() 1937 finally: 1938 if not self.connection.closed: 1939 self.connection.close()
1940
1941 - def __init__(self, pdb_paths, connection_string=None, factory=AssignmentFactory()):
1942 1943 self._pdb = pdb_paths 1944 self._connection = None 1945 1946 from csb.bio.io.wwpdb import find, StructureParser 1947 self._parser = StructureParser 1948 self._find = find 1949 self._factory = factory 1950 1951 try: 1952 import psycopg2.extras 1953 except ImportError: 1954 raise RuntimeError('Please install the psycopg2 module first') 1955 1956 if connection_string is None: 1957 connection_string = self.connection_string() 1958 1959 BenchmarkAdapter.Connection.FACTORY = psycopg2.extras.DictConnection 1960 BenchmarkAdapter.Connection.DSN = connection_string
1961 1962 @staticmethod
1963 - def connection_string(database='FragmentBenchmarks', host='', username='', password=''):
1964 1965 fields = ['dbname={0}'.format(database)] 1966 1967 if host: 1968 fields.append('host={0}'.format(host)) 1969 if username: 1970 fields.append('user={0}'.format(username)) 1971 fields.append('password={0}'.format(password)) 1972 1973 return ' '.join(fields)
1974
1975 - def targets(self, benchmark_id):
1976 1977 with BenchmarkAdapter.Connection() as db: 1978 1979 db.cursor.callproc('reporting."GetTargets"', (benchmark_id,)) 1980 return db.cursor.fetchall()
1981
1982 - def target_details(self, target_id):
1983 1984 with BenchmarkAdapter.Connection() as db: 1985 1986 db.cursor.callproc('reporting."GetTargetDetails"', (target_id,)) 1987 return db.cursor.fetchall()
1988
1989 - def assignments(self, target_id, type):
1990 1991 with BenchmarkAdapter.Connection() as db: 1992 1993 db.cursor.callproc('reporting."GetAssignments"', (target_id, type)) 1994 return db.cursor.fetchall()
1995
1996 - def assignments_sec_structure(self, target_id, type):
1997 1998 with BenchmarkAdapter.Connection() as db: 1999 2000 db.cursor.callproc('reporting."GetTargetSecStructureAssignments2"', (target_id, type)) 2001 return db.cursor.fetchall()
2002
2003 - def scores(self, benchmark_id, type):
2004 2005 with BenchmarkAdapter.Connection() as db: 2006 2007 db.cursor.callproc('reporting."GetScores"', (benchmark_id, type)) 2008 return db.cursor.fetchall()
2009
2010 - def centroids(self, benchmark_id):
2011 2012 with BenchmarkAdapter.Connection() as db: 2013 2014 db.cursor.callproc('reporting."GetCentroids"', (benchmark_id,)) 2015 return db.cursor.fetchall()
2016
2017 - def target_segments(self, target_id):
2018 2019 with BenchmarkAdapter.Connection() as db: 2020 2021 db.cursor.callproc('reporting."GetTargetSegments"', (target_id,)) 2022 data = db.cursor.fetchall() 2023 2024 return [ TargetSegment(row['Start'], row['End'], row['Count']) for row in data ]
2025
2026 - def structure(self, accession, chain=None):
2027 2028 pdbfile = self._find(accession, self._pdb) 2029 2030 if not pdbfile and chain: 2031 pdbfile = self._find(accession + chain, self._pdb) 2032 2033 if not pdbfile: 2034 raise IOError('{0} not found here: {1}'.format(accession, self._pdb)) 2035 2036 return self._parser(pdbfile).parse_structure()
2037
2038 - def prediction(self, target_id, type, ss=False):
2039 2040 info = self.target_details(target_id) 2041 if not info: 2042 raise ValueError('No such Target ID in the database: {0}'.format(target_id)) 2043 row = info[0] 2044 2045 id = row["Accession"] 2046 length = float(row["Length"]) 2047 overlap = float(row["MaxOverlap"]) / (length or 1.) 2048 2049 native = self.structure(id[:4], id[4]).chains[id[4]] 2050 segments = self.target_segments(target_id) 2051 target = self._factory.target(id, length, native.residues, overlap, segments) 2052 2053 source = None 2054 2055 for row in self.assignments(target_id, type): 2056 2057 src_accession = row['Source'][:4] 2058 src_chain = row['Source'][4] 2059 2060 if source is None or source.accession != src_accession: 2061 try: 2062 source = self.structure(src_accession, src_chain) 2063 except (IOError, ValueError) as ex: 2064 target.errors.append(ex) 2065 continue 2066 2067 if src_chain == '_': 2068 frag_chain = source.first_chain 2069 else: 2070 frag_chain = source.chains[src_chain] 2071 if not frag_chain.has_torsion: 2072 frag_chain.compute_torsion() 2073 2074 fragment = self._factory.assignment( 2075 source=frag_chain, 2076 start=row['SourceStart'], 2077 end=row['SourceEnd'], 2078 id=row['FragmentName'], 2079 qstart=row['Start'], 2080 qend=row['End'], 2081 probability=row['Probability'], 2082 score=row['Score'], 2083 neff=row['Neff'], 2084 rmsd=row['RMSD'], 2085 tm_score=row['TMScore'], 2086 segment=row['SegmentStart'], 2087 internal_id=row['InternalID']) 2088 2089 target.assign(fragment) 2090 2091 if ss: 2092 self._attach_sec_structure(target, target_id, type) 2093 2094 return target
2095
    def _attach_sec_structure(self, target, target_id, type):
        
        ss = {}
        
        for row in self.assignments_sec_structure(target_id, type):
            frag_id, state = row["AssignmentID"], row["DSSP"]
            if frag_id not in ss:
                ss[frag_id] = []
            
            ss[frag_id].append(state)
        
        for a in target.matches:
            if a.internal_id in ss:
                dssp = ''.join(ss[a.internal_id])
                a.secondary_structure = dssp
2111