Package csb :: Package bio :: Package io :: Module wwpdb
[frames] | no frames]

Source Code for Module csb.bio.io.wwpdb

   1  """ 
   2  PDB structure parsers, format builders and database providers. 
   3   
   4  The most basic usage is: 
   5   
   6      >>> parser = StructureParser('structure.pdb') 
   7      >>> parser.parse_structure() 
   8      <Structure>     # a Structure object (model) 
   9   
  10  or if this is an NMR ensemble: 
  11       
  12      >>> parser.parse_models() 
  13      <Ensemble>      # an Ensemble object (collection of alternative Structure-s) 
  14       
  15  This module introduces a family of PDB file parsers. The common interface of all 
  16  parsers is defined in L{AbstractStructureParser}. This class has several 
  17  implementations: 
  18   
  19      - L{RegularStructureParser} - handles normal PDB files with SEQRES fields 
  20      - L{LegacyStructureParser} - reads structures from legacy or malformed PDB files, 
  21        which are lacking SEQRES records (initializes all residues from the ATOMs instead) 
  22      - L{PDBHeaderParser} - reads only the headers of the PDB files and produces structures 
  23        without coordinates. Useful for reading metadata (e.g. accession numbers or just 
  24        plain SEQRES sequences) with minimum overhead 
  25         
  26  Unless you have a special reason, you should use the L{StructureParser} factory, 
  27  which returns a proper L{AbstractStructureParser} implementation, depending on the 
  28  input PDB file. If the input file looks like a regular PDB file, the factory 
  29  returns a L{RegularStructureParser}, otherwise it instantiates L{LegacyStructureParser}. 
  30  L{StructureParser} is in fact an alias for L{AbstractStructureParser.create_parser}. 
  31   
  32  Another important abstraction in this module is L{StructureProvider}. It has several 
  33  implementations which can be used to retrieve PDB L{Structure}s from various sources: 
  34  file system directories, remote URLs, etc. You can easily create your own provider as 
  35  well. See L{StructureProvider} for details. 
  36   
  37  Finally, this module gives you some L{FileBuilder}s, used for text serialization  
  38  of L{Structure}s and L{Ensemble}s: 
  39   
  40      >>> builder = PDBFileBuilder(stream) 
  41      >>> builder.add_header(structure) 
  42      >>> builder.add_structure(structure) 
  43   
  44  where stream is any Python stream, e.g. an open file or sys.stdout. 
  45   
  46  See L{Ensemble} and L{Structure} from L{csb.bio.structure} for details on these 
  47  objects. 
  48  """ 
  49   
  50  import re 
  51  import os 
  52  import numpy 
  53  import datetime 
  54  import multiprocessing 
  55   
  56  import csb.bio.structure 
  57  import csb.core 
  58  import csb.io 
  59   
  60  from abc import ABCMeta, abstractmethod 
  61  from csb.bio.sequence import SequenceTypes, SequenceAlphabets 
  62  from csb.bio.structure import ChemElements, SecStructures 
  63   
  64   
  65  PDB_AMINOACIDS = { 
  66      'PAQ': 'TYR', 'AGM': 'ARG', 'ILE': 'ILE', 'PR3': 'CYS', 'GLN': 'GLN', 
  67      'DVA': 'VAL', 'CCS': 'CYS', 'ACL': 'ARG', 'GLX': 'GLX', 'GLY': 'GLY', 
  68      'GLZ': 'GLY', 'DTH': 'THR', 'OAS': 'SER', 'C6C': 'CYS', 'NEM': 'HIS', 
  69      'DLY': 'LYS', 'MIS': 'SER', 'SMC': 'CYS', 'GLU': 'GLU', 'NEP': 'HIS', 
  70      'BCS': 'CYS', 'ASQ': 'ASP', 'ASP': 'ASP', 'SCY': 'CYS', 'SER': 'SER', 
  71      'LYS': 'LYS', 'SAC': 'SER', 'PRO': 'PRO', 'ASX': 'ASX', 'DGN': 'GLN', 
  72      'DGL': 'GLU', 'MHS': 'HIS', 'ASB': 'ASP', 'ASA': 'ASP', 'NLE': 'LEU', 
  73      'DCY': 'CYS', 'ASK': 'ASP', 'GGL': 'GLU', 'STY': 'TYR', 'SEL': 'SER', 
  74      'CGU': 'GLU', 'ASN': 'ASN', 'ASL': 'ASP', 'LTR': 'TRP', 'DAR': 'ARG', 
  75      'VAL': 'VAL', 'CHG': 'ALA', 'TPO': 'THR', 'CLE': 'LEU', 'GMA': 'GLU', 
  76      'HAC': 'ALA', 'AYA': 'ALA', 'THR': 'THR', 'TIH': 'ALA', 'SVA': 'SER', 
  77      'MVA': 'VAL', 'SAR': 'GLY', 'LYZ': 'LYS', 'BNN': 'ALA', '5HP': 'GLU', 
  78      'IIL': 'ILE', 'SHR': 'LYS', 'HAR': 'ARG', 'FME': 'MET', 'ALO': 'THR', 
  79      'PHI': 'PHE', 'ALM': 'ALA', 'PHL': 'PHE', 'MEN': 'ASN', 'TPQ': 'ALA', 
  80      'GSC': 'GLY', 'PHE': 'PHE', 'ALA': 'ALA', 'MAA': 'ALA', 'MET': 'MET', 
  81      'UNK': 'UNK', 'LEU': 'LEU', 'ALY': 'LYS', 'SET': 'SER', 'GL3': 'GLY', 
  82      'TRG': 'LYS', 'CXM': 'MET', 'TYR': 'TYR', 'SCS': 'CYS', 'DIL': 'ILE', 
  83      'TYQ': 'TYR', '3AH': 'HIS', 'DPR': 'PRO', 'PRR': 'ALA', 'CME': 'CYS', 
  84      'IYR': 'TYR', 'CY1': 'CYS', 'TYY': 'TYR', 'HYP': 'PRO', 'DTY': 'TYR', 
  85      '2AS': 'ASP', 'DTR': 'TRP', 'FLA': 'ALA', 'DPN': 'PHE', 'DIV': 'VAL', 
  86      'PCA': 'GLU', 'MSE': 'MET', 'MSA': 'GLY', 'AIB': 'ALA', 'CYS': 'CYS', 
  87      'NLP': 'LEU', 'CYQ': 'CYS', 'HIS': 'HIS', 'DLE': 'LEU', 'CEA': 'CYS', 
  88      'DAL': 'ALA', 'LLP': 'LYS', 'DAH': 'PHE', 'HMR': 'ARG', 'TRO': 'TRP', 
  89      'HIC': 'HIS', 'CYG': 'CYS', 'BMT': 'THR', 'DAS': 'ASP', 'TYB': 'TYR', 
  90      'BUC': 'CYS', 'PEC': 'CYS', 'BUG': 'LEU', 'CYM': 'CYS', 'NLN': 'LEU', 
  91      'CY3': 'CYS', 'HIP': 'HIS', 'CSO': 'CYS', 'TPL': 'TRP', 'LYM': 'LYS', 
  92      'DHI': 'HIS', 'MLE': 'LEU', 'CSD': 'ALA', 'HPQ': 'PHE', 'MPQ': 'GLY', 
  93      'LLY': 'LYS', 'DHA': 'ALA', 'DSN': 'SER', 'SOC': 'CYS', 'CSX': 'CYS', 
  94      'OMT': 'MET', 'DSP': 'ASP', 'PTR': 'TYR', 'TRP': 'TRP', 'CSW': 'CYS', 
  95      'EFC': 'CYS', 'CSP': 'CYS', 'CSS': 'CYS', 'SCH': 'CYS', 'OCS': 'CYS', 
  96      'NMC': 'GLY', 'SEP': 'SER', 'BHD': 'ASP', 'KCX': 'LYS', 'SHC': 'CYS', 
  97      'C5C': 'CYS', 'HTR': 'TRP', 'ARG': 'ARG', 'TYS': 'TYR', 'ARM': 'ARG', 
  98      'DNP': 'ALA' 
  99      } 
 100  """ 
 101  Dictionary of non-standard amino acids, which could be found in PDB. 
 102  """ 
 103   
 104   
 105  PDB_NUCLEOTIDES = { 
 106      'DA': 'Adenine', 'DG': 'Guanine', 'DC': 'Cytosine', 'DT': 'Thymine', 
 107       'A': 'Adenine', 'G': 'Guanine', 'C': 'Cytosine', 'T': 'Thymine', 
 108       'U': 'Uracil', 'DOC': 'Cytosine', 'R': 'Purine', 'Y': 'Pyrimidine', 
 109       'K': 'Ketone', '  M': 'Amino', 'S': 'Strong', 'W': 'Weak', 
 110       'B': 'NotA', 'D'  : 'NotC', 'H': 'NotG', 'V': 'NotT', 
 111       'N': 'Any', 'X'  : 'Masked' 
 112      } 
 113  """ 
 114  Dictionary of non-standard nucleotides, which could be found in PDB. 
 115  """ 
116 117 -class PDBParseError(ValueError):
118 pass
119
120 -class HeaderFormatError(PDBParseError):
121 pass
122
123 -class SecStructureFormatError(PDBParseError):
124 pass
125
126 -class StructureFormatError(PDBParseError):
127 pass
128
129 -class UnknownPDBResidueError(PDBParseError):
130 pass
131
132 -class StructureNotFoundError(KeyError):
133 pass
134
135 -class InvalidEntryIDError(StructureFormatError):
136 pass
137
138 139 -class EntryID(object):
140 """ 141 Represents a PDB Chain identifier. Implementing classes must define 142 how the original ID is split into accession number and chain ID. 143 144 @param id: identifier 145 @type id: str 146 """ 147 __metaclass__ = ABCMeta 148
149 - def __init__(self, id):
150 151 self._accession = '' 152 self._chain = '' 153 154 id = id.strip() 155 self._accession, self._chain = self.parse(id)
156 157 @staticmethod
158 - def create(id):
159 """ 160 Guess the format of C{id} and parse it. 161 162 @return: a new PDB ID of the appropriate type 163 @rtype: L{EntryID} 164 """ 165 166 if len(id) in (4, 5): 167 return StandardID(id) 168 elif len(id) == 6 and id[4] == '_': 169 return SeqResID(id) 170 else: 171 return DegenerateID(id)
172 173 @abstractmethod
174 - def parse(self, id):
175 """ 176 Split C{id} into accession number and chain ID. 177 178 @param id: PDB identifier 179 @type id: str 180 @return: (accession, chain) 181 @rtype: tuple of str 182 183 @raise InvalidEntryIDError: when C{id} is in an inappropriate format 184 """ 185 pass
186
187 - def format(self):
188 """ 189 @return: the identifier in its original format 190 @rtype: str 191 """ 192 return self.entry_id
193 194 @property
195 - def accession(self):
196 """ 197 Accession number part of the Entry ID 198 @rtype: str 199 """ 200 return self._accession
201 202 @property
203 - def chain(self):
204 """ 205 Chain ID part of the Entry ID 206 @rtype: str 207 """ 208 return self._chain
209 210 @property
211 - def entry_id(self):
212 """ 213 Accession number + Chain ID 214 @rtype: str 215 """ 216 return "{0.accession}{0.chain}".format(self)
217
218 - def __str__(self):
219 return self.entry_id
220
221 -class StandardID(EntryID):
222 """ 223 Standard PDB ID in the following form: xxxxY, where xxxx is the accession 224 number (lower case) and Y is an optional chain identifier. 225 """
226 - def parse(self, id):
227 228 if len(id) not in (4, 5): 229 raise InvalidEntryIDError(id) 230 231 return (id[:4].lower(), id[4:])
232
233 -class DegenerateID(EntryID):
234 """ 235 Looks like a L{StandardID}, except that the accession number may have 236 arbitrary length. 237 """
238 - def parse(self, id):
239 240 if len(id) < 2: 241 raise InvalidEntryIDError(id) 242 243 return (id[:-1].lower(), id[-1])
244
245 -class SeqResID(EntryID):
246 """ 247 Same as a L{StandardID}, but contains an additional underscore between 248 te accession number and the chain identifier. 249 """
250 - def parse(self, id):
251 252 if not (len(id) == 6 and id[4] == '_'): 253 raise InvalidEntryIDError(id) 254 255 return (id[:4].lower(), id[5:])
256
257 - def format(self):
258 return "{0.accession}_{0.chain}".format(self)
259
260 261 -class AbstractStructureParser(object):
262 """ 263 A base PDB structure format-aware parser. Subclasses must implement the 264 internal abstract method C{_parse_header} in order to complete the 265 implementation. 266 267 @param structure_file: the input PD file to parse 268 @type structure_file: str 269 @param check_ss: if True, secondary structure errors in the file will cause 270 L{SecStructureFormatError} exceptions 271 @type check_ss: bool 272 273 @raise IOError: when the input file cannot be found 274 """ 275 276 __metaclass__ = ABCMeta 277 278 @staticmethod
279 - def create_parser(structure_file, check_ss=False):
280 """ 281 A StructureParser factory, which instantiates and returns the proper parser 282 object based on the contents of the PDB file. 283 284 If the file contains a SEQRES section, L{RegularStructureParser} is returned, 285 otherwise L{LegacyStructureParser} is instantiated. In the latter case 286 LegacyStructureParser will read the sequence data directly from the ATOMs. 287 288 @param structure_file: the PDB file to parse 289 @type structure_file: str 290 291 @rtype: L{AbstractStructureParser} 292 """ 293 has_seqres = False 294 295 for line in open(structure_file): 296 if line.startswith('SEQRES'): 297 has_seqres = True 298 break 299 300 if has_seqres: 301 return RegularStructureParser(structure_file) 302 else: 303 return LegacyStructureParser(structure_file)
304
305 - def __init__(self, structure_file, check_ss=False):
306 307 self._file = None 308 self._stream = None 309 self._check_ss = bool(check_ss) 310 311 self.filename = structure_file
312
313 - def __del__(self):
314 try: 315 self._stream.close() 316 except: 317 pass
318 319 @property
320 - def filename(self):
321 """ 322 Current input PDB file name 323 @rtype: str 324 """ 325 return self._file
326 @filename.setter
327 - def filename(self, name):
328 try: 329 stream = open(name) 330 except IOError: 331 raise IOError('File not found: {0}'.format(name)) 332 333 if self._stream: 334 try: 335 self._stream.close() 336 except: 337 pass 338 self._stream = stream 339 self._file = name
340
341 - def models(self):
342 """ 343 Find all available model identifiers in the structure. 344 345 @return: a list of model IDs 346 @rtype: list 347 """ 348 models = [] 349 check = set() 350 351 with open(self._file, 'r') as f: 352 for line in f: 353 if line.startswith('MODEL'): 354 model_id = int(line[10:14]) 355 if model_id in check: 356 raise StructureFormatError('Duplicate model identifier: {0}'.format(model_id)) 357 models.append(model_id) 358 check.add(model_id) 359 360 if len(models) > 0: 361 if not(min(check) == 1 and max(check) == len(models)): 362 raise StructureFormatError('Non-consecutive model identifier(s) encountered') 363 return models 364 else: 365 return []
366
367 - def guess_sequence_type(self, residue_name):
368 """ 369 Try to guess what is the sequence type of a PDB C{residue_name}. 370 371 @param residue_name: a PDB-conforming name of a residue 372 @type residue_name: str 373 374 @return: a L{SequenceTypes} enum member 375 @rtype: L{csb.core.EnumItem} 376 377 @raise UnknownPDBResidueError: when there is no such PDB residue name 378 in the catalog tables 379 """ 380 if residue_name in PDB_AMINOACIDS: 381 return SequenceTypes.Protein 382 elif residue_name in PDB_NUCLEOTIDES: 383 return SequenceTypes.NucleicAcid 384 else: 385 raise UnknownPDBResidueError(residue_name)
386
387 - def parse_residue(self, residue_name, as_type=None):
388 """ 389 Try to parse a PDB C{residue_name} and return its closest 'normal' 390 string representation. If a sequence type (C{as_type}) is defined, 391 guess the alphabet based on that information, otherwise try first to 392 parse it as a protein residue. 393 394 @param residue_name: a PDB-conforming name of a residue 395 @type residue_name: str 396 @param as_type: suggest a sequence type (L{SequenceTypes} member) 397 @type L{scb.core.EnumItem} 398 399 @return: a normalized residue name 400 @rtype: str 401 402 @raise UnknownPDBResidueError: when there is no such PDB residue name 403 in the catalog table(s) 404 """ 405 if as_type is None: 406 as_type = self.guess_sequence_type(residue_name) 407 408 try: 409 if as_type == SequenceTypes.Protein: 410 return PDB_AMINOACIDS[residue_name] 411 elif as_type == SequenceTypes.NucleicAcid: 412 return PDB_NUCLEOTIDES[residue_name] 413 else: 414 raise UnknownPDBResidueError(residue_name) 415 except KeyError: 416 raise UnknownPDBResidueError(residue_name)
417
418 - def parse_residue_safe(self, residue_name, as_type):
419 """ 420 Same as C{parse_residue}, but returns UNK/Any instead of raising 421 UnknownPDBResidueError. 422 423 @param residue_name: a PDB-conforming name of a residue 424 @type residue_name: str 425 @param as_type: suggest a sequence type (L{SequenceTypes} member) 426 @type L{scb.core.EnumItem} 427 428 @return: a normalized residue name 429 @rtype: str 430 """ 431 try: 432 return self.parse_residue(residue_name, as_type) 433 except UnknownPDBResidueError: 434 if as_type == SequenceTypes.Protein: 435 return repr(SequenceAlphabets.Protein.UNK) 436 elif as_type == SequenceTypes.NucleicAcid: 437 return repr(SequenceAlphabets.Nucleic.Any) 438 else: 439 return repr(SequenceAlphabets.Unknown.UNK)
440
441 - def parse(self, filename=None, model=None):
442 443 if filename: 444 self.filename = filename 445 return self.parse_structure(model)
446
447 - def parse_structure(self, model=None):
448 """ 449 Parse and return the L{Structure} with the specified model identifier. 450 If no explicit model is specified, parse the first model in the 451 structure. 452 453 @param model: parse exactly the model with this ID 454 @type model: str 455 456 @return: object representation of the selected model 457 @rtype: L{Structure} 458 459 @raise ValueError: When an invalid model ID is specified 460 @raise PDBParseError: When the input PDB file suffers from unrecoverable 461 corruption. More specialized exceptions will be 462 raised depending on the context (see L{PDBParseError}'s 463 subclasses). 464 """ 465 if model is not None: 466 model = int(model) 467 468 structure = self._parse_header(model) 469 470 self._parse_atoms(structure, model) 471 self._parse_ss(structure) 472 473 return structure
474
475 - def parse_models(self, models=()):
476 """ 477 Parse the specified models in the file and build an L{Ensemble}. 478 479 @param models: an iterable object providing model identifiers. 480 If not specified, all models will be parsed. 481 @type models: tuple 482 483 @return: an ensemble with all parsed models 484 @rtype: L{Ensemble} 485 """ 486 487 if not models: 488 models = self.models() 489 else: 490 models = list(map(int, models)) 491 492 ensemble = csb.bio.structure.Ensemble() 493 494 if len(models) > 0: 495 for model_id in models: 496 model = self.parse_structure(model_id) 497 ensemble.models.append(model) 498 else: 499 model = self.parse_structure() 500 model.model_id = 1 501 ensemble.models.append(model) 502 503 return ensemble
504
505 - def parse_biomolecule(self, number=1, single=False):
506 """ 507 Parse and return the L{Structure} of the biological unit (quaternary 508 structure) as annotated by the REMARK 350 BIOMOLECULE record. 509 510 @param number: biomolecule number 511 @type number: int 512 513 @param single: if True, assign new single-letter chain 514 identifiers. If False, assign multi-letter chain identifiers whith a 515 number appended to the original identifier, like "A1", "A2", ... 516 @type single: bool 517 518 @return: structure of biological unit 519 @rtype: L{Structure} 520 """ 521 remarks = self._parse_remarks() 522 if 350 not in remarks: 523 raise PDBParseError('There is no REMARK 350') 524 525 current = 1 526 biomt = {current: {}} 527 chains = tuple() 528 529 def split(line): 530 return [c.strip() for c in line.split(',') if c.strip() != '']
531 532 for line in remarks[350]: 533 if line.startswith('BIOMOLECULE:'): 534 current = int(line[12:]) 535 biomt[current] = {} 536 elif line.startswith('APPLY THE FOLLOWING TO CHAINS:'): 537 chains = tuple(split(line[30:])) 538 elif line.startswith(' AND CHAINS:'): 539 chains += tuple(split(line[30:])) 540 elif line.startswith(' BIOMT'): 541 num = int(line[8:12]) 542 vec = line[12:].split() 543 vec = list(map(float, vec)) 544 biomt[current].setdefault(chains, dict()).setdefault(num, []).extend(vec) 545 546 if number not in biomt or len(biomt[number]) == 0: 547 raise KeyError('no BIOMOLECULE number {0}'.format(number)) 548 549 asu = self.parse_structure() 550 structure = csb.bio.structure.Structure('{0}_{1}'.format(asu.accession, number)) 551 552 newchainiditer = iter('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789') 553 554 for chains, matrices in biomt[number].items(): 555 for num in matrices: 556 mat = numpy.array(matrices[num][0:12]).reshape((3,4)) 557 R, t = mat[:3,:3], mat[:3,3] 558 559 for chain in chains: 560 if chain not in asu: 561 raise PDBParseError('chain {0} missing'.format(chain)) 562 copy = asu[chain].clone() 563 copy.transform(R, t) 564 if single: 565 if len(structure.chains) == 62: 566 raise ValueError('too many chains for single=True') 567 copy.id = next(newchainiditer) 568 else: 569 copy.id = '{0}{1}'.format(chain, num) 570 structure.chains.append(copy) 571 572 return structure
573 574 @abstractmethod
575 - def _parse_header(self, model):
576 """ 577 An abstract method, which subclasses must use to customize the way the 578 PDB header (or the absence of a such) is handled. The implementation 579 must rely on reading character data from the current internal 580 self._stream and must return a new L{csb.bio.structure.Structure} 581 instance with properly initialized header data: accession, model, 582 molecule identifiers, chains and residues. This structure object is 583 then internally passed to the C{_parse_atoms} hook, responsible for 584 attachment of the atoms to the residues in the structure. 585 """ 586 pass
587
588 - def _scroll_model(self, model, stream):
589 """ 590 Scroll the C{stream} to the specified C{model}. 591 """ 592 593 while True: 594 try: 595 line = next(stream) 596 except StopIteration: 597 raise ValueError('No such model {0} in the structure.'.format(model)) 598 599 if line.startswith('MODEL'): 600 model_id = int(line[10:14]) 601 if model == model_id: 602 return model_id
603
604 - def _parse_atoms(self, structure, model):
605 """ 606 Parse the ATOMs from the specified C{model} and attach them to the 607 C{structure}. 608 """ 609 610 structure.model_id = None 611 612 atoms = dict( (chain, []) for chain in structure.chains ) 613 chains = set() 614 total_chains = len([c for c in structure.items if c.length > 0]) 615 het_residues = dict( (chain, set()) for chain in structure.chains ) 616 in_ligands = False 617 in_atom = False 618 619 self._stream.seek(0) 620 while True: 621 622 try: 623 line = next(self._stream) 624 except StopIteration: 625 break 626 627 if line.startswith('HET '): 628 het_residue, het_chain, het_residue_id = line[7:10].strip(), line[12], line[13:18].strip() 629 630 if het_chain in structure: 631 chain = structure.chains[het_chain] 632 if chain.type == SequenceTypes.Protein and het_residue in PDB_AMINOACIDS: 633 het_residues[het_chain].add(het_residue_id) 634 elif chain.type == SequenceTypes.NucleicAcid and het_residue in PDB_NUCLEOTIDES: 635 het_residues[het_chain].add(het_residue_id) 636 637 elif line.startswith('MODEL'): 638 if model and model != int(line[10:14]): 639 self._scroll_model(model, self._stream) 640 structure.model_id = model 641 else: 642 model = int(line[10:14]) 643 structure.model_id = model 644 645 elif line.startswith('ATOM') \ 646 or (line.startswith('HETATM') and not in_ligands): 647 in_atom = True 648 649 rank = int(line[22:26]) 650 serial_number = int(line[6:11]) 651 name = line[12:16] 652 x, y, z = line[30:38], line[38:46], line[46:54] 653 vector = numpy.array([float(x), float(y), float(z)]) 654 element = line[76:78].strip() 655 if element: 656 try: 657 element = csb.core.Enum.parsename(ChemElements, element) 658 except csb.core.EnumMemberError: 659 if element in ('D', 'X'): 660 element = ChemElements.x 661 else: 662 raise StructureFormatError('Unknown chemical element: {0}'.format(element)) 663 else: 664 element = None 665 666 atom = csb.bio.structure.Atom(serial_number, name, element, 667 vector) 668 669 atom._het = line.startswith('HETATM') 670 atom._rank = rank 671 atom._sequence_number = int(line[22:26].strip()) 672 atom._residue_id = str(atom._sequence_number) 673 atom._insertion_code = line[26].strip() 674 if not atom._insertion_code: 675 atom._insertion_code = None 676 else: 677 atom._residue_id += atom._insertion_code 678 679 atom.alternate = line[16].strip() 680 if not atom.alternate: 681 atom.alternate = None 682 683 atom._chain = line[21].strip() 684 if atom._chain not in structure.chains: 685 raise StructureFormatError( 686 'Atom {0}: chain {1} is undefined'.format(serial_number, atom._chain)) 687 chains.add(atom._chain) 688 residue_name = line[17:20].strip() 689 residue_name = self.parse_residue_safe(residue_name, as_type=structure.chains[atom._chain].type) 690 if structure.chains[atom._chain].type == SequenceTypes.NucleicAcid: 691 atom._residue_name = csb.core.Enum.parsename(SequenceAlphabets.Nucleic, residue_name) 692 else: 693 atom._residue_name = csb.core.Enum.parsename(SequenceAlphabets.Protein, residue_name) 694 695 atom.occupancy = float(line[54:60].strip() or 0) 696 atom.bfactor = float(line[60:66].strip() or 0) 697 698 charge = line[78:80].strip() 699 if charge: 700 if charge in ('+', '-'): 701 charge += '1' 702 if charge[-1] in ('+', '-'): 703 charge = charge[::-1] 704 atom.charge = int(charge) 705 706 atoms[atom._chain].append(atom) 707 708 elif in_atom and line.startswith('TER'): 709 in_atom = False 710 if len(chains) == total_chains: 711 in_ligands = True 712 713 elif line.startswith('ENDMDL'): 714 break 715 716 elif line.startswith('END'): 717 break 718 719 if structure.model_id != model: 720 raise ValueError('No such model {0} in the structure.'.format(model)) 721 722 self._map_residues(structure, atoms, het_residues)
723
724 - def _map_residues(self, structure, atoms, het_residues):
725 726 assert set(structure.chains) == set(atoms.keys()) 727 728 for chain in structure.chains: 729 730 subject = structure.chains[chain].sequence 731 query = '' 732 fragments = [] 733 734 seq_numbers = [] 735 lookup = {} 736 737 i = -1 738 for a in atoms[chain]: 739 if a._residue_id not in lookup: 740 i += 1 741 lookup[a._residue_id] = [a._sequence_number, a._insertion_code] 742 seq_numbers.append(a._residue_id) 743 res_name = a._residue_name.value 744 res_id = '{0}{1}'.format(a._sequence_number or '', a._insertion_code or '').strip() 745 if a._het and res_id not in het_residues[chain]: 746 # if it is a HETATM, but not a modified residue, initiate an optional fragment 747 fragments.append([res_name, '?']) 748 elif a._insertion_code and not (i > 0 and lookup[seq_numbers[i - 1]][1]): 749 fragments.append([res_name]) 750 elif i == 0 or a._sequence_number - lookup[seq_numbers[i - 1]][0] not in (0, 1, -1): 751 # if residues [i, i-1] are not consecutive or 'overlapping', initiate a new fragment: 752 fragments.append([res_name]) 753 else: 754 # then they are consecutive 755 if fragments[-1][-1] == '?': 756 # but the prev was optional (maybe HET), all optionals *MUST* reside in 757 # singleton fragments, so start a new fragment 758 fragments.append([res_name]) 759 else: 760 # append the new residue to the end of the last fragment 761 fragments[-1].append(res_name) 762 763 for i, frag in enumerate(fragments): 764 fragments[i] = ''.join(frag) 765 if len(fragments) > 100: 766 # Python's regex engine will crash with more than 100 groups 767 raise StructureFormatError("Can't map structure with more than 100 fragments in ATOMS") 768 query = '^.*?({0}).*?$'.format(').*?('.join(fragments)) 769 770 matches = re.match(query, subject) 771 772 if matches: 773 seq_numitem = -1 774 for frag_number, frag in enumerate(matches.groups(), start=1): 775 if frag is '': 776 # optional fragment, which did not match (NOTE: all optionals must occupy 777 # their own fragments of length 1 residue) 778 seq_numitem += 1 # this 1 implies that all optional fragments are 1 residue long 779 else: 780 for i, dummy in enumerate(frag, start=1): 781 seq_numitem += 1 782 # lookup[res_id] is finally set to give the final rank of residue under id res_id: 783 try: 784 lookup[ seq_numbers[seq_numitem] ] = matches.start(frag_number) + i 785 except: 786 raise 787 788 fixed_residue = None 789 for atom in atoms[chain]: 790 if not isinstance(lookup[atom._residue_id], int): 791 continue # this atom was not mapped (e.g. HET) 792 atom._rank = lookup[atom._residue_id] 793 residue = structure.chains[chain].residues[atom._rank] 794 if residue is not fixed_residue: 795 residue.id = atom._sequence_number, atom._insertion_code 796 fixed_residue = residue 797 798 assert str(residue.type) == subject[atom._rank - 1] 799 residue.atoms.append(atom) 800 801 del atom._rank 802 del atom._insertion_code 803 del atom._sequence_number 804 del atom._chain 805 del atom._residue_id 806 del atom._residue_name 807 else: 808 if structure.chains[chain].length == 0 and len(atoms[chain]) > 0: 809 raise StructureFormatError("Can't add atoms: chain {0} has no residues".format(chain)) 810 else: 811 raise StructureFormatError("Can't map atoms")
812
813 - def _parse_ss(self, structure):
814 """ 815 Parse and attach secondary structure data. 816 817 @bug: Currently the PDB helix types are ignored. Each HELIX line is treated 818 as a regular SecStructures.Helix. This is due to incompatibility 819 between DSSP and PDB helix types. 820 @todo: Implement a proper workaround for the previous bug (e.g. skip all 821 helices types not included in the DSSP enum) 822 823 @warning: In this implementation only the start/end positions of the SS 824 elements are parsed. Additional data like H-bonding is ignored. 825 826 @bug: Currently structure.to_pdb() is not writing any SS data. 827 """ 828 elements = {} 829 self._stream.seek(0) 830 831 while True: 832 try: 833 line = next(self._stream) 834 except StopIteration: 835 break 836 837 if line.startswith('HELIX'): 838 839 chain = structure.chains[line[19].strip()] 840 if chain.id not in elements: 841 elements[chain.id] = [] 842 if chain.id != line[31].strip(): 843 if self._check_ss: 844 raise SecStructureFormatError('Helix {0} spans multiple chains'.format(line[7:10])) 845 else: 846 continue 847 try: 848 startres = chain.find(line[21:25].strip(), line[25].strip()) 849 endres = chain.find(line[33:37].strip(), line[37].strip()) 850 except csb.core.ItemNotFoundError as ex: 851 if self._check_ss: 852 raise SecStructureFormatError( 853 'Helix {0} refers to an undefined residue ID: {1}'.format(line[7:10], str(ex))) 854 else: 855 continue 856 if not startres.rank <= endres.rank: 857 if self._check_ss: 858 raise SecStructureFormatError('Helix {0} is out of range'.format(line[7:10])) 859 else: 860 continue 861 helix = csb.bio.structure.SecondaryStructureElement(startres.rank, endres.rank, SecStructures.Helix) 862 elements[chain.id].append(helix) 863 864 if line.startswith('SHEET'): 865 866 chain = structure.chains[line[21].strip()] 867 if chain.id not in elements: 868 elements[chain.id] = [] 869 if chain.id != line[32].strip(): 870 if self._check_ss: 871 raise SecStructureFormatError('Sheet {0} spans multiple chains'.format(line[7:10])) 872 else: 873 continue 874 try: 875 startres = chain.find(line[22:26].strip(), line[26].strip()) 876 endres = chain.find(line[33:37].strip(), line[37].strip()) 877 except csb.core.ItemNotFoundError as ex: 878 if self._check_ss: 879 raise SecStructureFormatError( 880 'Sheet {0} refers to an undefined residue ID: {1}'.format(line[7:10], str(ex))) 881 else: 882 continue 883 if not startres.rank <= endres.rank: 884 if self._check_ss: 885 raise SecStructureFormatError('Sheet {0} is out of range'.format(line[7:10])) 886 else: 887 continue 888 strand = csb.bio.structure.SecondaryStructureElement(startres.rank, endres.rank, SecStructures.Strand) 889 elements[chain.id].append(strand) 890 891 elif line.startswith('MODEL') or line.startswith('ATOM'): 892 break 893 894 for chain_id in elements: 895 ss = csb.bio.structure.SecondaryStructure() 896 for e in elements[chain_id]: 897 ss.append(e) 898 structure.chains[chain_id].secondary_structure = ss
899
900 - def _parse_remarks(self):
901 """ 902 Read REMARK lines from PDB file. 903 904 @return: dictionary with remark numbers as keys, and lists of lines as values. 905 @rtype: dict 906 """ 907 self._stream.seek(0) 908 909 remarks = {} 910 911 for line in self._stream: 912 if line.startswith('REMARK'): 913 num = int(line[7:10]) 914 lstring = line[11:] 915 remarks.setdefault(num, []).append(lstring) 916 elif line.startswith('DBREF') or line.startswith('ATOM'): 917 break 918 919 return remarks
920
921 922 -class RegularStructureParser(AbstractStructureParser):
923 """ 924 This is the de facto PDB parser, which is designed to read SEQRES and ATOM 925 sections separately, and them map them. Intentionally fails to parse 926 malformed PDB files, e.g. a PDB file without a HEADER section. 927 """ 928
929 - def _parse_header(self, model):
930 """ 931 Parse the HEADER section of a regular PDB file. 932 933 @return: a L{csb.bio.structure.Structure} instance with properly 934 initialized residues from the SEQRES. 935 @rtype: L{csb.bio.structure.Structure} 936 937 @raise PDBParseError: if the stream has no HEADER at byte 0 938 """ 939 940 self._stream.seek(0) 941 942 header = next(self._stream) 943 if not header.startswith('HEADER'): 944 raise PDBParseError('Does not look like a regular PDB file.') 945 946 structure = csb.bio.structure.Structure(header.split()[-1]) 947 948 while True: 949 950 try: 951 line = next(self._stream) 952 except StopIteration: 953 break 954 955 if line.startswith('COMPND'): 956 if line[10:].lstrip().startswith('MOL_ID:'): 957 mol_id = int(line[18:].replace(';', '').strip()) 958 chain_name = '' 959 chains = '' 960 while line.startswith('COMPND'): 961 line = next(self._stream) 962 if line.split()[2].startswith('MOLECULE:'): 963 chain_name += line[20:].strip() 964 while not chain_name.endswith(';'): 965 line = next(self._stream) 966 if not line.startswith('COMPND'): 967 break 968 chain_name += ' ' + line[11:].strip() 969 else: 970 while not line.split()[2].startswith('CHAIN:'): 971 line = next(self._stream) 972 if not line.startswith('COMPND'): 973 raise HeaderFormatError('Missing chain identifier in COMPND section') 974 chains = line[17:].strip() 975 while not chains.endswith(';'): 976 line = next(self._stream) 977 if not line.startswith('COMPND'): 978 break 979 chains += ', ' + line[11:].strip() 980 break 981 982 chain_name = chain_name.strip()[:-1] 983 for chain in chains.replace(';', ' ').replace(',', ' ').split() or ['']: # the second part deals with an empty chain id 984 new_chain = csb.bio.structure.Chain(chain, type=SequenceTypes.Unknown, 985 name=chain_name, accession=structure.accession) 986 new_chain.molecule_id = mol_id 987 try: 988 structure.chains.append(new_chain) 989 except csb.core.DuplicateKeyError: 990 raise HeaderFormatError('Chain {0} is already defined.'.format(new_chain.id)) 991 992 elif line.startswith('REMARK 2 RESOLUTION'): 993 res = re.search("(\d+(?:\.\d+)?)\s+ANGSTROM", line) 994 if res and res.groups(): 995 structure.resolution = float(res.group(1)) 996 997 elif line.startswith('SEQRES'): 998 # Correct handling of empty chain id 999 seq_fields = [line[7:10], line[11], line[13:17] ] 1000 seq_fields.extend(line[18:].split()) 1001 1002 rank_base = int(seq_fields[0]) 1003 chain_id = seq_fields[1].strip() 1004 1005 if chain_id not in structure.chains: 1006 raise HeaderFormatError('Chain {0} is undefined'.format(chain_id)) 1007 1008 chain = structure.chains[chain_id] 1009 1010 if chain.type == SequenceTypes.Unknown: 1011 inner_residuerank = int(len(seq_fields[3:]) / 2) + 3 1012 for i in [inner_residuerank, 3, -1]: 1013 try: 1014 chain.type = self.guess_sequence_type(seq_fields[i]) 1015 break 1016 except UnknownPDBResidueError: 1017 pass 1018 1019 for i, residue_name in enumerate(seq_fields[3:]): 1020 rank = rank_base * 13 - (13 - (i + 1)) 1021 rname = self.parse_residue_safe(residue_name, as_type=chain.type) 1022 residue = csb.bio.structure.Residue.create(chain.type, rank=rank, type=rname) 1023 residue._pdb_name = residue_name 1024 structure.chains[chain_id].residues.append(residue) 1025 assert structure.chains[chain_id].residues.last_index == rank 1026 1027 elif line.startswith('MODEL') or line.startswith('ATOM'): 1028 break 1029 1030 return structure
1031
1032 1033 -class PDBHeaderParser(RegularStructureParser):
1034 """ 1035 Ultra fast PDB HEADER parser. Does not read any structural data. 1036 """ 1037
1038 - def _parse_atoms(self, structure, model):
1039 pass
1040
1041 - def _parse_ss(self, structure):
1042 pass
1043
1044 - def _parse_header(self, model):
1045 return super(PDBHeaderParser, self)._parse_header(model)
1046
1047 1048 -class LegacyStructureParser(AbstractStructureParser):
1049 """ 1050 This is a customized PDB parser, which is designed to read both sequence and 1051 atom data from the ATOM section. This is especially useful when parsing PDB 1052 files without a header. 1053 """ 1054
1055 - def _parse_header(self, model):
1056 """ 1057 Initialize a structure with residues from the ATOMs section. 1058 1059 @param model: model identifier (e.g. if multiple models exist) 1060 @type model: str 1061 1062 @return: a L{csb.bio.structure.Structure} instance with properly 1063 initialized residues from ATOMs under the specified C{model}. 1064 @rtype: L{csb.bio.structure.Structure} 1065 """ 1066 1067 self._stream.seek(0) 1068 in_atom = False 1069 has_atoms = False 1070 chains = csb.core.OrderedDict() 1071 1072 header = next(self._stream) 1073 if header.startswith('HEADER'): 1074 structure = csb.bio.structure.Structure(header.split()[-1]) 1075 else: 1076 self._stream.seek(0) 1077 structure = csb.bio.structure.Structure('NONE') 1078 1079 structure.model_id = None 1080 1081 while True: 1082 1083 try: 1084 line = next(self._stream) 1085 except StopIteration: 1086 break 1087 1088 if line.startswith('MODEL'): 1089 if model and model != int(line[10:14]): 1090 self._scroll_model(model, self._stream) 1091 structure.model_id = model 1092 else: 1093 model = int(line[10:14]) 1094 structure.model_id = model 1095 1096 elif line.startswith('ATOM') \ 1097 or (in_atom and line.startswith('HETATM')): 1098 in_atom = True 1099 has_atoms = True 1100 1101 residue_id = line[22:27].strip() 1102 residue_name = line[17:20].strip() 1103 chain_id = line[21].strip() 1104 1105 if chain_id not in chains: 1106 chains[chain_id] = csb.core.OrderedDict() 1107 1108 new_chain = csb.bio.structure.Chain( 1109 chain_id, 1110 type=SequenceTypes.Unknown, 1111 accession=structure.accession) 1112 new_chain.molecule_id = '1' 1113 structure.chains.append(new_chain) 1114 1115 if residue_id not in chains[chain_id]: 1116 chains[chain_id][residue_id] = residue_name 1117 1118 if structure.chains[chain_id].type == SequenceTypes.Unknown: 1119 try: 1120 structure.chains[chain_id].type = self.guess_sequence_type(residue_name) 1121 except UnknownPDBResidueError: 1122 pass 1123 1124 elif in_atom and line.startswith('TER'): 1125 in_atom = False 1126 1127 elif line.startswith('ENDMDL'): 1128 break 1129 1130 elif line.startswith('END'): 1131 break 1132 1133 if not has_atoms: 1134 raise HeaderFormatError("Can't parse legacy structure: no ATOMs found") 1135 1136 for chain_id in structure.chains: 1137 chain = structure.chains[chain_id] 1138 1139 for residue_id in chains[chain_id]: 1140 residue_name = chains[chain_id][residue_id] 1141 rank = (chain.residues.last_index or 0) + 1 1142 1143 rname = self.parse_residue_safe(residue_name, as_type=structure.chains[chain_id].type) 1144 residue = csb.bio.structure.Residue.create(chain.type, rank=rank, type=rname) 1145 residue._pdb_name = residue_name 1146 chain.residues.append(residue) 1147 1148 return structure
1149 1150 1151 StructureParser = AbstractStructureParser.create_parser 1152 """ 1153 Alias for L{AbstractStructureParser.create_parser}. 1154 """
1155 1156 -class FileBuilder(object):
1157 """ 1158 Base abstract files for all structure file formatters. 1159 Defines a common step-wise interface according to the Builder pattern. 1160 1161 @param output: output stream (this is where the product is constructed) 1162 @type output: stream 1163 """ 1164 1165 __metaclass__ = ABCMeta 1166
1167 - def __init__(self, output):
1168 1169 if not hasattr(output, 'write'): 1170 raise TypeError(output) 1171 1172 def isnull(this, that, null=None): 1173 if this is null: 1174 return that 1175 else: 1176 return this
1177 1178 self._out = output 1179 self._isnull = isnull
1180 1181 @property
1182 - def output(self):
1183 """ 1184 Destination stream 1185 @rtype: stream 1186 """ 1187 return self._out
1188 1189 @property
1190 - def isnull(self):
1191 """ 1192 ISNULL(X, Y) function 1193 @rtype: callable 1194 """ 1195 return self._isnull
1196
1197 - def write(self, text):
1198 """ 1199 Write a chunk of text 1200 """ 1201 self._out.write(text)
1202
1203 - def writeline(self, text):
1204 """ 1205 Write a chunk of text and append a new line terminator 1206 """ 1207 self._out.write(text) 1208 self._out.write('\n')
1209 1210 @abstractmethod
1211 - def add_header(self, master_structure):
1212 pass
1213 1214 @abstractmethod
1215 - def add_structure(self, structure):
1216 pass
1217
1218 - def finalize(self):
1219 pass
1220
1221 -class PDBFileBuilder(FileBuilder):
1222 """ 1223 PDB file format builder. 1224 """ 1225
1226 - def writeline(self, text):
1227 self.write('{0:80}\n'.format(text))
1228
1229 - def add_header(self, master):
1230 """ 1231 Write the HEADER of the file using C{master} 1232 1233 @type master: L{Structure} 1234 """ 1235 1236 isnull = self.isnull 1237 1238 header = 'HEADER {0:40}{1:%d-%b-%y} {2:4}' 1239 self.writeline(header.format('.', datetime.datetime.now(), master.accession.upper())) 1240 1241 molecules = { } 1242 1243 for chain_id in master.chains: 1244 chain = master.chains[chain_id] 1245 if chain.molecule_id not in molecules: 1246 molecules[chain.molecule_id] = [ ] 1247 molecules[chain.molecule_id].append(chain_id) 1248 1249 k = 0 1250 for mol_id in sorted(molecules): 1251 1252 chains = molecules[mol_id] 1253 first_chain = master.chains[ chains[0] ] 1254 1255 self.writeline('COMPND {0:3} MOL_ID: {1};'.format(k + 1, isnull(mol_id, '0'))) 1256 self.writeline('COMPND {0:3} MOLECULE: {1};'.format(k + 2, isnull(first_chain.name, ''))) 1257 self.writeline('COMPND {0:3} CHAIN: {1};'.format(k + 3, ', '.join(chains))) 1258 k += 3 1259 1260 for chain_id in master.chains: 1261 1262 chain = master.chains[chain_id] 1263 res = [ r._pdb_name for r in chain.residues ] 1264 1265 rn = 0 1266 for j in range(0, chain.length, 13): 1267 rn += 1 1268 residues = [ '{0:>3}'.format(r) for r in res[j : j + 13] ] 1269 self.writeline('SEQRES {0:>3} {1} {2:>4} {3}'.format( 1270 rn, chain.id, chain.length, ' '.join(residues) ))
1271
1272 - def add_structure(self, structure):
1273 """ 1274 Append a new model to the file 1275 1276 @type structure: L{Structure} 1277 """ 1278 1279 isnull = self.isnull 1280 1281 for chain_id in structure.chains: 1282 1283 chain = structure.chains[chain_id] 1284 for residue in chain.residues: 1285 1286 atoms = [ ] 1287 for an in residue.atoms: 1288 atom = residue.atoms[an] 1289 if isinstance(atom, csb.bio.structure.DisorderedAtom): 1290 for dis_atom in atom: atoms.append(dis_atom) 1291 else: 1292 atoms.append(atom) 1293 atoms.sort() 1294 1295 for atom in atoms: 1296 1297 alt = atom.alternate 1298 if alt is True: 1299 alt = 'A' 1300 elif alt is False: 1301 alt = ' ' 1302 1303 if atom.element: 1304 element = repr(atom.element) 1305 else: 1306 element = ' ' 1307 self.writeline('ATOM {0:>5} {1:>4}{2}{3:>3} {4}{5:>4}{6} {7:>8.3f}{8:>8.3f}{9:>8.3f}{10:>6.2f}{11:>6.2f}{12:>12}{13:2}'.format( 1308 atom.serial_number, atom._full_name, isnull(alt, ' '), 1309 residue._pdb_name, chain.id, 1310 isnull(residue.sequence_number, residue.rank), isnull(residue.insertion_code, ' '), 1311 atom.vector[0], atom.vector[1], atom.vector[2], isnull(atom.occupancy, 0.0), isnull(atom.bfactor, 0.0), 1312 element, isnull(atom.charge, ' ') )) 1313 1314 self.writeline('TER')
1315
1316 - def finalize(self):
1317 """ 1318 Add the END marker 1319 """ 1320 self.writeline('END') 1321 self._out.flush()
1322
1323 -class PDBEnsembleFileBuilder(PDBFileBuilder):
1324 """ 1325 Supports serialization of NMR ensembles. 1326 1327 Functions as a simple decorator, which wraps C{add_structure} with 1328 MODEL/ENDMDL records. 1329 """ 1330
1331 - def add_structure(self, structure):
1332 1333 model_id = self.isnull(structure.model_id, 1) 1334 1335 self.writeline('MODEL {0:>4}'.format(model_id)) 1336 super(PDBEnsembleFileBuilder, self).add_structure(structure) 1337 self.writeline('ENDMDL')
1338
1339 1340 -class StructureProvider(object):
1341 """ 1342 Base class for all PDB data source providers. 1343 1344 Concrete classes need to implement the C{find} method, which abstracts the 1345 retrieval of a PDB structure file by a structure identifier. This is a hook 1346 method called internally by C{get}, but subclasses can safely override both 1347 C{find} and {get} to in order to achieve completely custom behavior. 1348 """ 1349 1350 __metaclass__ = ABCMeta 1351
1352 - def __getitem__(self, id):
1353 return self.get(id)
1354 1355 @abstractmethod
1356 - def find(self, id):
1357 """ 1358 Attempt to discover a PDB file, given a specific PDB C{id}. 1359 1360 @param id: structure identifier (e.g. 1x80) 1361 @type id: str 1362 @return: path and file name on success, None otherwise 1363 @rtype: str or None 1364 """ 1365 pass
1366
1367 - def get(self, id, model=None):
1368 """ 1369 Discover, parse and return the PDB structure, corresponding to the 1370 specified C{id}. 1371 1372 @param id: structure identifier (e.g. 1x80) 1373 @type id: str 1374 @param model: optional model identifier 1375 @type model: str 1376 1377 @rtype: L{csb.bio.Structure} 1378 1379 @raise StructureNotFoundError: when C{id} could not be found 1380 """ 1381 pdb = self.find(id) 1382 1383 if pdb is None: 1384 raise StructureNotFoundError(id) 1385 else: 1386 return StructureParser(pdb).parse_structure(model=model)
1387
1388 -class FileSystemStructureProvider(StructureProvider):
1389 """ 1390 Simple file system based PDB data source. Scans a list of local directories 1391 using pre-defined file name templates. 1392 1393 @param paths: a list of paths 1394 @type paths: iterable or str 1395 """ 1396
1397 - def __init__(self, paths=None):
1398 1399 self._templates = ['pdb{id}.ent', 'pdb{id}.pdb', '{id}.pdb', '{id}.ent'] 1400 self._paths = csb.core.OrderedDict() 1401 1402 if paths is not None: 1403 if isinstance(paths, csb.core.string): 1404 paths = [paths] 1405 1406 for path in paths: 1407 self.add(path)
1408 1409 @property
1410 - def paths(self):
1411 """ 1412 Current search paths 1413 @rtype: tuple 1414 """ 1415 return tuple(self._paths)
1416 1417 @property
1418 - def templates(self):
1419 """ 1420 Current file name match templates 1421 @rtype: tuple 1422 """ 1423 return tuple(self._templates)
1424
1425 - def add(self, path):
1426 """ 1427 Register a new local C{path}. 1428 1429 @param path: directory name 1430 @type path: str 1431 1432 @raise IOError: if C{path} is not a valid directory 1433 """ 1434 if os.path.isdir(path): 1435 self._paths[path] = path 1436 else: 1437 raise IOError(path)
1438
1439 - def add_template(self, template):
1440 """ 1441 Register a custom file name name C{template}. The template must contain 1442 an E{lb}idE{rb} macro, e.g. pdbE{lb}idE{rb}.ent 1443 1444 @param template: pattern 1445 @type template: str 1446 """ 1447 1448 if '{id}' not in template: 1449 raise ValueError('Template does not contain an "{id}" macro') 1450 1451 if template not in self._templates: 1452 self._templates.append(template)
1453
1454 - def remove(self, path):
1455 """ 1456 Unregister an existing local C{path}. 1457 1458 @param path: directory name 1459 @type path: str 1460 1461 @raise ValueError: if C{path} had not been registered 1462 """ 1463 if path not in self._paths: 1464 raise ValueError('path not found: {0}'.format(path)) 1465 1466 del self._paths[path]
1467
1468 - def find(self, id):
1469 1470 for path in self._paths: 1471 for token in self.templates: 1472 fn = os.path.join(path, token.format(id=id)) 1473 if os.path.exists(fn): 1474 return fn 1475 1476 return None
1477
1478 -class RemoteStructureProvider(StructureProvider):
1479 """ 1480 Retrieves PDB structures from a specified remote URL. 1481 The URL requested from remote server takes the form: <prefix>/<ID><suffix> 1482 1483 @param prefix: URL prefix, including protocol 1484 @type prefix: str 1485 @param suffix: optional URL suffix (.ent by default) 1486 @type suffix: str 1487 """ 1488
1489 - def __init__(self, prefix='http://www.rcsb.org/pdb/files/pdb', suffix='.ent'):
1490 1491 self._prefix = None 1492 self._suffix = None 1493 1494 self.prefix = prefix 1495 self.suffix = suffix
1496 1497 @property
1498 - def prefix(self):
1499 """ 1500 Current URL prefix 1501 @rtype: str 1502 """ 1503 return self._prefix
1504 @prefix.setter
1505 - def prefix(self, value):
1506 self._prefix = value
1507 1508 @property
1509 - def suffix(self):
1510 """ 1511 Current URL suffix 1512 @rtype: str 1513 """ 1514 return self._suffix
1515 @suffix.setter
1516 - def suffix(self, value):
1517 self._suffix = value
1518
1519 - def _find(self, id):
1520 1521 try: 1522 return csb.io.urllib.urlopen(self.prefix + id + self.suffix) 1523 except: 1524 raise StructureNotFoundError(id)
1525
1526 - def find(self, id):
1527 1528 stream = self._find(id) 1529 1530 try: 1531 tmp = csb.io.TempFile(dispose=False) 1532 tmp.write(stream.read().decode('utf-8')) 1533 tmp.flush() 1534 return tmp.name 1535 1536 except StructureNotFoundError: 1537 return None 1538 finally: 1539 stream.close()
1540
1541 - def get(self, id, model=None):
1542 1543 stream = self._find(id) 1544 1545 try: 1546 with csb.io.TempFile() as tmp: 1547 tmp.write(stream.read().decode('utf-8')) 1548 tmp.flush() 1549 return StructureParser(tmp.name).parse_structure(model=model) 1550 finally: 1551 stream.close()
1552
1553 -class CustomStructureProvider(StructureProvider):
1554 """ 1555 A custom PDB data source. Functions as a user-defined map of structure 1556 identifiers and their corresponding local file names. 1557 1558 @param files: initialization dictionary of id:file pairs 1559 @type files: dict-like 1560 """ 1561
1562 - def __init__(self, files={}):
1563 1564 self._files = {} 1565 for id in files: 1566 self.add(id, files[id])
1567 1568 @property
1569 - def paths(self):
1570 """ 1571 List of currently registered file names 1572 @rtype: tuple 1573 """ 1574 return tuple(self._files.values())
1575 1576 @property
1577 - def identifiers(self):
1578 """ 1579 List of currently registered structure identifiers 1580 @rtype: tuple 1581 """ 1582 return tuple(self._files)
1583
1584 - def add(self, id, path):
1585 """ 1586 Register a new local C{id}:C{path} pair. 1587 1588 @param id: structure identifier 1589 @type id: str 1590 @param path: path and file name 1591 @type path: str 1592 1593 @raise IOError: if C{path} is not a valid file name 1594 """ 1595 if os.path.isfile(path): 1596 self._files[id] = path 1597 else: 1598 raise IOError(path)
1599
1600 - def remove(self, id):
1601 """ 1602 Unregister an existing structure C{id}. 1603 1604 @param id: structure identifier 1605 @type id: str 1606 1607 @raise ValueError: if C{id} had not been registered 1608 """ 1609 if id not in self._files: 1610 raise ValueError(id) 1611 else: 1612 del self._files[id]
1613
1614 - def find(self, id):
1615 1616 if id in self._files: 1617 return self._files[id] 1618 else: 1619 return None
1620
1621 -def get(accession, model=None, prefix='http://www.rcsb.org/pdb/files/pdb'):
1622 """ 1623 Download and parse a PDB entry. 1624 1625 @param accession: accession number of the entry 1626 @type accession: str 1627 @param model: model identifier 1628 @type model: str 1629 @param prefix: download URL prefix 1630 @type prefix: str 1631 1632 @return: object representation of the selected model 1633 @rtype: L{Structure} 1634 """ 1635 return RemoteStructureProvider(prefix).get(accession, model=model)
1636
1637 -def find(id, paths):
1638 """ 1639 Try to discover a PDB file for PDB C{id} in C{paths}. 1640 1641 @param id: PDB ID of the entry 1642 @type id: str 1643 @param paths: a list of directories to scan 1644 @type paths: list of str 1645 1646 @return: path and file name on success, None otherwise 1647 @rtype: str 1648 """ 1649 return FileSystemStructureProvider(paths).find(id)
1650
1651 1652 -class AsyncParseResult(object):
1653
1654 - def __init__(self, result, exception):
1655 1656 self.result = result 1657 self.exception = exception
1658
1659 - def __repr__(self):
1660 return '<AsyncParseResult: result={0.result}, error={0.exception.__class__.__name__}>'.format(self)
1661
1662 1663 -def _parse_async(parser, file, model):
1664 p = parser(file) 1665 return p.parse_structure(model)
1666
1667 1668 -class AsyncStructureParser(object):
1669 """ 1670 Wraps StructureParser in an asynchronous call. Since a new process is 1671 started by Python internally (as opposed to only starting a new thread), 1672 this makes the parser slower, but provides a way to set a parse timeout 1673 limit. 1674 1675 If initialized with more than one worker, supports parallel parsing 1676 through the C{self.parse_async} method. 1677 1678 @param workers: number of worker threads (1 by default) 1679 @type workers: int 1680 """ 1681
1682 - def __init__(self, workers=1):
1683 1684 self._pool = None 1685 self._workers = 1 1686 1687 if int(workers) > 0: 1688 self._workers = int(workers) 1689 else: 1690 raise ValueError(workers) 1691 1692 self._recycle()
1693
1694 - def _recycle(self):
1695 1696 if self._pool: 1697 self._pool.terminate() 1698 1699 self._pool = multiprocessing.Pool(processes=self._workers)
1700
1701 - def parse_structure(self, structure_file, timeout, model=None, 1702 parser=RegularStructureParser):
1703 """ 1704 Call StructureParser.parse_structure() in a separate process and return 1705 the output. Raise TimeoutError if the parser does not respond within 1706 C{timeout} seconds. 1707 1708 @param structure_file: structure file to parse 1709 @type structure_file: str 1710 @param timeout: raise multiprocessing.TimeoutError if C{timeout} seconds 1711 elapse before the parser completes its job 1712 @type timeout: int 1713 @param parser: any implementing L{AbstractStructureParser} class 1714 @type parser: type 1715 1716 @return: parsed structure 1717 @rtype: L{csb.structure.Structure} 1718 """ 1719 1720 r = self.parse_async([structure_file], timeout, model, parser) 1721 if len(r) > 0: 1722 if r[0].exception is not None: 1723 raise r[0].exception 1724 else: 1725 return r[0].result 1726 return None
1727
1728 - def parse_async(self, structure_files, timeout, model=None, 1729 parser=RegularStructureParser):
1730 """ 1731 Call C{self.parse_structure} for a list of structure files 1732 simultaneously. The actual degree of parallelism will depend on the 1733 number of workers specified while constructing the parser object. 1734 1735 @note: Don't be tempted to pass a large list of structures to this 1736 method. Every time a C{TimeoutError} is encountered, the 1737 corresponding worker process in the pool will hang until the 1738 process terminates on its own. During that time, this worker is 1739 unusable. If a sufficiently high number of timeouts occur, the 1740 whole pool of workers will be unsable. At the end of the method 1741 however a pool cleanup is performed and any unusable workers 1742 are 'reactivated'. However, that only happens at B{the end} of 1743 C{parse_async}. 1744 1745 @param structure_files: a list of structure files 1746 @type structure_files: tuple of str 1747 @param timeout: raise multiprocessing.TimeoutError if C{timeout} seconds 1748 elapse before the parser completes its job 1749 @type timeout: int 1750 @param parser: any implementing L{AbstractStructureParser} class 1751 @type parser: type 1752 1753 @return: a list of L{AsyncParseResult} objects 1754 @rtype: list 1755 """ 1756 1757 pool = self._pool 1758 workers = [] 1759 results = [] 1760 1761 for file in list(structure_files): 1762 result = pool.apply_async(_parse_async, [parser, file, model]) 1763 workers.append(result) 1764 1765 hanging = False 1766 for w in workers: 1767 result = AsyncParseResult(None, None) 1768 try: 1769 result.result = w.get(timeout=timeout) 1770 except KeyboardInterrupt as ki: 1771 pool.terminate() 1772 raise ki 1773 except Exception as ex: 1774 result.exception = ex 1775 if isinstance(ex, multiprocessing.TimeoutError): 1776 hanging = True 1777 results.append(result) 1778 1779 if hanging: 1780 self._recycle() 1781 1782 return results
1783