1 """
2 FASTA format sequence I/O.
3
4 This module provides parsers and writers for sequences and alignments in
5 FASTA format. The most basic usage is:
6
7 >>> parser = SequenceParser()
8 >>> parser.parse_file('sequences.fa')
9 <SequenceCollection> # collection of L{AbstractSequence}s
10
11 This will load all sequences in memory. If you are parsing a huge file,
12 then you could efficiently read the file sequence by sequence:
13
14 >>> for seq in parser.read('sequences.fa'):
15 ... # seq is an L{AbstractSequence}
16
17 L{BaseSequenceParser} is the central class in this module, which defines a
18 common infrastructure for all sequence readers. L{SequenceParser} is a standard
19 implementation, and L{PDBSequenceParser} is specialized to read FASTA sequences
20 with PDB headers.
21
22 For parsing alignments, have a look at L{SequenceAlignmentReader} and
23 L{StructureAlignmentFactory}.
24
25 Finally, this module provides a number of L{OutputBuilder}s, which know how to
26 write L{AbstractSequence} and L{AbstractAlignment} objects to FASTA files:
27
28 >>> with open('file.fa', 'w') as out:
29 builder = OutputBuilder.create(AlignmentFormats.FASTA, out)
30 builder.add_alignment(alignment)
31 builder.add_sequence(sequence)
32 ...
33
34 or you could instantiate any of the L{OutputBuilder}s directly.
35 """
36
37 import csb.io
38 import csb.core
39
40 from abc import ABCMeta, abstractmethod
41
42 from csb.bio.sequence import SequenceTypes, SequenceAlphabets, AlignmentFormats, SequenceError
43 from csb.bio.sequence import SequenceAlignment, StructureAlignment, A3MAlignment
44 from csb.bio.sequence import SequenceCollection, AbstractSequence, Sequence, RichSequence, ChainSequence
48 """
49 FASTA parser template. Subclasses must implement the way FASTA strings are
50 handled by overriding C{BaseSequenceParser.read_sequence}.
51
52 @param product: sequence product factory (an L{AbstractSequence} subclass)
53 @type product: type
54 @param product_type: default L{SequenceTypes} member for the products
55 @type product_type: L{EnumItem}
56 """
57 __metaclass__ = ABCMeta
58
71
72 @property
74 """
75 Factory used to build sequence products
76 @rtype: class
77 """
78 return self._product
79
80 @property
82 """
83 Default sequence type of the products - a member of L{SequenceTypes}
84 @rtype: enum item
85 """
86 return self._type
87
88 @abstractmethod
90 """
91 Parse a single FASTA string
92
93 @return: a new sequence, created with L{BaseSequenceParser.product_factory}
94 @rtype: L{AbstractSequence}
95 """
96 pass
97
99 """
100 Read FASTA sequences from an (m)FASTA-formatted string
101
102 @param fasta_string: FASTA string to parse
103 @type fasta_string: str
104
105 @return: a list of L{Sequence}s
106 @rtype: L{SequenceCollection}
107 """
108
109 stream = csb.io.MemoryStream()
110 stream.write(fasta_string)
111
112 return self.parse_file(stream)
113
115 """
116 Read FASTA sequences from a (m)FASTA file
117
118 @param fasta_file: input FASTA file name or opened stream
119 @type fasta_file: str, file
120
121 @return: a list of L{Sequence}s
122 @rtype: L{SequenceCollection}
123 """
124 if isinstance(fasta_file, csb.core.string):
125 stream = open(fasta_file)
126 else:
127 stream = fasta_file
128
129 seqs = []
130 reader = csb.io.EntryReader(stream, AbstractSequence.DELIMITER, None)
131
132 for entry in reader.entries():
133 seqs.append(self.read_sequence(entry))
134
135 return SequenceCollection(seqs)
136
def read(self, fasta_file):
    """
    Read FASTA sequences from an (m)FASTA file, parsing one entry at a
    time instead of loading the whole file in memory.

    If C{fasta_file} is a file name, the file is opened here and closed
    again once the iteration ends (on exhaustion, on error, or when the
    iterator is closed early). A stream passed in by the caller is never
    closed by this method.

    @param fasta_file: input FASTA file name or opened stream
    @type fasta_file: str, file

    @return: efficient cursor over all L{Sequence}s (parse on demand)
    @rtype: iterator
    """
    # Only a handle opened by this method is ours to close.
    owns_stream = isinstance(fasta_file, csb.core.string)
    stream = open(fasta_file) if owns_stream else fasta_file

    try:
        reader = csb.io.EntryReader(stream, AbstractSequence.DELIMITER, None)

        for entry in reader.entries():
            yield self.read_sequence(entry)
    finally:
        # Previously the handle leaked: nothing ever closed a file that
        # was opened from a file name.
        if owns_stream:
            stream.close()
156
158 """
159 Standard FASTA parser. See L{BaseSequenceParser} for details.
160 """
161
176
178 """
179 PDB FASTA parser. Reads the PDB ID and sequence type from the header.
180 See L{BaseSequenceParser} for more details.
181 """
182
198
201
202 - def __init__(self, sequences, insertion):
212
237
240
243
245 """
246 Sequence alignment parser.
247
248 @param product_type: default L{SequenceTypes} member for the sequence products
249 @type product_type: L{EnumItem}
250 @param strict: if True, raise exception on duplicate sequence identifiers.
251 See L{csb.bio.sequence.AbstractAlignment} for details
252 @type strict: bool
253 """
254
262
263 @property
265 """
266 Default sequence type of the alignment entries - a member of L{SequenceTypes}
267 @rtype: enum item
268 """
269 return self._type
270
271 @property
273 """
274 True if strict mode is enabled
275 @rtype: bool
276 """
277 return self._strict
278
293
329
331 """
332 Protein structure alignment parser.
333
334 In order to construct the structural alignment, this factory needs a PDB
335 structure provider: an object, whose C{provider.get} method returns a
336 L{csb.bio.structure.Structure} for a given sequence identifier. Sequence
337 identifiers on the other hand need to be split into 'accession number'
338 and 'chain ID'. By default this is done using a standard PDB Entry ID
339 factory, but clients are free to provide custom factories. An C{id_factory}
340 must be a callable, which accepts a single string identifier and returns
341 an EntryID object.
342
343 @param provider: data source for all structures found in the alignment
344 @type provider: L{csb.bio.io.wwpdb.StructureProvider}
345 @param id_factory: callable factory, which transforms a sequence ID into
346 a L{csb.bio.io.wwpdb.EntryID} object. By default
347 this is L{csb.bio.io.wwpdb.EntryID.create}.
348 @type id_factory: callable
349 @param strict: if True, raise exception on duplicate sequence identifiers.
350 See L{csb.bio.sequence.AbstractAlignment} for details
351 @type strict: bool
352 """
353
354 - def __init__(self, provider, id_factory=None, strict=True):
370
371 @property
373 """
374 Default sequence type of the alignment rows - a member of L{SequenceTypes}
375 @rtype: enum item
376 """
377 return self._type
378
379 @property
381 """
382 Current L{csb.bio.io.wwpdb.StructureProvider} instance in use
383 @rtype: L{StructureProvider}
384 """
385 return self._provider
386
387 @property
389 """
390 Current L{csb.bio.io.wwpdb.EntryID} factory instance in use
391 @rtype: L{EntryID}
392 """
393 return self._id_factory
394
395 @property
397 """
398 True if strict mode is enabled
399 @rtype: bool
400 """
401 return self._strict
402
432
def make_entry(self, row, chain):
    """
    Build a protein structure alignment entry from a sequence alignment
    entry and the PDB chain it originates from.

    The ungapped, upper-cased sequence of C{row} is located inside
    C{chain.sequence}. Every non-gap column of C{row} is then replaced
    by a clone of the matching chain residue, while gap columns are kept
    as they are.

    @param row: sequence alignment entry (sequence with gaps)
    @type row: L{AbstractSequence}, L{SequenceAdapter}
    @param chain: source PDB chain
    @type chain: L{csb.bio.structure.Chain}

    @return: gapped chain sequence, containing cloned residues from the
             source chain (except for the gaps)
    @rtype: L{ChainSequence}
    @raise SequenceError: when C{row} is not a proper subsequence of C{chain}
    """
    ungapped = row.strip().sequence.upper()

    try:
        # str.index is 0-based; the residue lookup below uses index + 1
        rank = chain.sequence.index(ungapped) + 1
    except ValueError:
        raise SequenceError('{0}: not a subsequence of {1}'.format(row.id, chain.entry_id))

    aligned = []

    for rinfo in row.residues:

        if rinfo.type == row.alphabet.GAP:
            # gaps have no counterpart in the chain: keep the column as is
            aligned.append(rinfo)
            continue

        source = chain.residues[rank]
        # sanity check: the substring match above implies equal residue types
        assert source.type == rinfo.type
        aligned.append(source.clone())
        rank += 1

    return ChainSequence(row.id, row.header, aligned, chain.type)
470
473 """
474 Base sequence/alignment string format builder.
475
476 @param output: destination stream, where the product is written.
477 @type output: file
478 @param headers: if False, omit headers
479 @type headers: bool
480
481 @note: File builders are not guaranteed to check the correctness of the
482 product. It is assumed that the client of the builder knows what
483 it is doing.
484 """
485 __metaclass__ = ABCMeta
486 _registry = {}
487
def __init__(self, output, headers=True):
    """
    Bind the builder to its destination stream.

    @param output: destination stream; must expose a C{write} attribute
    @type output: file
    @param headers: if False, omit headers (stored as a bool)
    @type headers: bool

    @raise TypeError: when C{output} has no C{write} attribute
    """
    is_writable = hasattr(output, 'write')

    if not is_writable:
        raise TypeError(output)

    self._out = output
    self._headers = bool(headers)
495
496 @staticmethod
498 """
499 Create an output builder, which knows how to handle the specified
500 sequence/alignment C{format}. Additional arguments are passed to the
501 builder's constructor.
502
503 @param format: L{AlignmentFormats} member
504 @type format: L{EnumItem}
505 @rtype: L{OutputBuilder}
506 """
507 if format not in OutputBuilder._registry:
508 raise ValueError('Unhandled format: {0}'.format(format))
509
510 klass = OutputBuilder._registry[format]
511 return klass(*a, **k)
512
513 @staticmethod
515 """
516 Register a new output builder.
517
518 @param format: L{AlignmentFormats} member
519 @type format: L{EnumItem}
520 @param klass: builder class (L{OutputBuilder} sub-class)
521 @type klass: type
522 """
523 assert format not in OutputBuilder._registry
524 assert issubclass(klass, OutputBuilder)
525
526 OutputBuilder._registry[format] = klass
527
528 @property
530 """
531 Destination stream
532 @rtype: stream
533 """
534 return self._out
535
536 @property
538 """
539 True if sequence headers will be written to the destination
540 @rtype: bool
541 """
542 return self._headers
543
545 """
546 Write a chunk of C{text} to the output stream.
547 """
548 self._out.write(text)
549
551 """
552 Write a chunk of C{text}, followed by a newline terminator.
553 """
554 self._out.write(text)
555 self._out.write('\n')
556
557 @abstractmethod
559 """
560 Format and append a new sequence to the product.
561 @type sequence: L{AbstractSequence}
562 """
563 pass
564
566 """
567 Format and append a collection of L{AbstractSequence}s to the product.
568 @type sequences: iterable of L{AbstractSequence}s
569 """
570 for s in sequences:
571 self.add_sequence(s)
572
573 @abstractmethod
575 """
576 Format and append an alignment to the product.
577 @type alignment: L{AbstractAlignment}
578 """
579 pass
580
582 """
583 Append a sequence separator to the product.
584 """
585 self.writeline(separator)
586
603
624
626 """
627 Formats sequences as A3M strings. When appending an alignment, this builder
628 will write all insertion-containing columns in lower case. Also, gap symbols
629 are omitted if the respective columns contain insertions.
630
631 See L{OutputBuilder}.
632 """
633 FORMAT = AlignmentFormats.A3M
634
642
649
673
697
699 """
700 Formats sequences as PIR FASTA strings, recognized by Modeller.
701 See L{OutputBuilder} for general alignment documentation.
702 """
703 FORMAT = AlignmentFormats.PIR
704
707
715
716
719
722
723 - def _add(self, sequence, template=True):
749
750
751
# Register every direct OutputBuilder subclass under the format it declares
# in its FORMAT attribute, so that OutputBuilder.create can look it up.
for klass in OutputBuilder.__subclasses__():
    OutputBuilder.register(klass.FORMAT, klass)
754