Package csb :: Package apps :: Module hhfrag
[frames] | no frames]

Source Code for Module csb.apps.hhfrag

  1  """ 
  2  HHfrag: build a dynamic variable-length fragment library for protein structure 
  3  prediction with Rosetta AbInitio. 
  4  """ 
  5   
  6  import os 
  7  import multiprocessing 
  8   
  9  import csb.apps 
 10  import csb.apps.hhsearch as hhsearch 
 11   
 12  import csb.bio.io.hhpred 
 13  import csb.bio.fragments 
 14  import csb.bio.fragments.rosetta as rosetta 
 15  import csb.bio.structure 
 16   
 17  import csb.io 
 18  import csb.core 
19 20 21 -class ExitCodes(csb.apps.ExitCodes):
22 23 IO_ERROR = 2 24 INVALID_DATA = 3 25 HHSEARCH_FAILURE = 4 26 NO_OUTPUT = 5
27
28 29 -class AppRunner(csb.apps.AppRunner):
30 31 @property
32 - def target(self):
33 return HHfragApp
34
35 - def command_line(self):
36 37 cmd = csb.apps.ArgHandler(self.program, __doc__) 38 cpu = multiprocessing.cpu_count() 39 40 cmd.add_scalar_option('hhsearch', 'H', str, 'path to the HHsearch executable', default='hhsearch') 41 cmd.add_scalar_option('database', 'd', str, 'database directory (containing PDBS25.hhm)', required=True) 42 43 cmd.add_scalar_option('min', 'm', int, 'minimum query segment length', default=6) 44 cmd.add_scalar_option('max', 'M', int, 'maximum query segment length', default=21) 45 cmd.add_scalar_option('step', 's', int, 'query segmentation step', default=3) 46 cmd.add_scalar_option('cpu', 'c', int, 'maximum degree of parallelism', default=cpu) 47 48 cmd.add_scalar_option('verbosity', 'v', int, 'verbosity level', default=2) 49 cmd.add_scalar_option('output', 'o', str, 'output directory', default='.') 50 cmd.add_scalar_option('gap-filling', 'g', str, 'path to a Rosetta 9-mer fragment file, that will be used ' 51 'to complement gaps in the fragment map (if specified, a joint fragment file will be produced)') 52 cmd.add_boolean_option('filtered-map', 'f', 'make an additional filtered fragment map', default=False) 53 cmd.add_boolean_option('c-alpha', None, 'include also C-alpha vectors in the output', default=False) 54 55 cmd.add_positional_argument('QUERY', str, 'query profile HMM (e.g. created with csb.apps.buildhmm)') 56 57 return cmd
58
59 60 -class HHfragApp(csb.apps.Application):
61
62 - def main(self):
63 if not os.path.isdir(self.args.output): 64 HHfragApp.exit('Output directory does not exist', code=ExitCodes.INVALID_DATA, usage=True) 65 66 if self.args.c_alpha: 67 builder = rosetta.ExtendedOutputBuilder 68 else: 69 builder = rosetta.OutputBuilder 70 71 try: 72 hhf = HHfrag(self.args.QUERY, self.args.hhsearch, self.args.database, logger=self) 73 output = os.path.join(self.args.output, hhf.query.id) 74 75 hhf.slice_query(self.args.min, self.args.max, self.args.step, self.args.cpu) 76 frags = hhf.extract_fragments() 77 78 if len(frags) == 0: 79 HHfragApp.exit('No fragments found!', code=ExitCodes.NO_OUTPUT) 80 81 fragmap = hhf.build_fragment_map() 82 fragmap.dump(output + '.hhfrags.09', builder) 83 84 if self.args.filtered_map: 85 fragmap = hhf.build_filtered_map() 86 fragmap.dump(output + '.filtered.09', builder) 87 88 if self.args.gap_filling: 89 fragmap = hhf.build_combined_map(self.args.gap_filling) 90 fragmap.dump(output + '.complemented.09', builder) 91 92 self.log('\nDONE.') 93 94 except ArgumentIOError as ae: 95 HHfragApp.exit(str(ae), code=ExitCodes.IO_ERROR) 96 97 except ArgumentError as ae: 98 HHfragApp.exit(str(ae), code=ExitCodes.INVALID_DATA, usage=True) 99 100 except csb.io.InvalidCommandError as ose: 101 msg = '{0!s}: {0.program}'.format(ose) 102 HHfragApp.exit(msg, ExitCodes.IO_ERROR) 103 104 except csb.bio.io.hhpred.HHProfileFormatError as hfe: 105 msg = 'Corrupt HMM: {0!s}'.format(hfe) 106 HHfragApp.exit(msg, code=ExitCodes.INVALID_DATA) 107 108 except csb.io.ProcessError as pe: 109 message = 'Bad exit code from HHsearch: #{0.code}.\nSTDERR: {0.stderr}\nSTDOUT: {0.stdout}'.format(pe.context) 110 HHfragApp.exit(message, ExitCodes.HHSEARCH_FAILURE)
111
112 - def log(self, message, ending='\n', level=1):
113 114 if level <= self.args.verbosity: 115 super(HHfragApp, self).log(message, ending)
116
117 -class ArgumentError(ValueError):
118 pass
119
120 -class ArgumentIOError(ArgumentError):
121 pass
122
123 -class InvalidOperationError(ValueError):
124 pass
125
126 127 -class HHfrag(object):
128 129 PDBS = 'pdbs25.hhm' 130
131 - def __init__(self, query, binary, database, logger=None):
132 133 try: 134 self._query = csb.bio.io.HHProfileParser(query).parse() 135 except IOError as io: 136 raise ArgumentIOError(str(io)) 137 self._hsqs = None 138 self._matches = None 139 140 self._app = logger 141 self._database = None 142 self._pdbs25 = None 143 self._output = None 144 self._aligner = None 145 146 self.database = database 147 self.aligner = hhsearch.HHsearch(binary, self.pdbs25, cpu=2)
148 149 @property
150 - def query(self):
151 return self._query
152 153 @property
154 - def pdbs25(self):
155 return self._pdbs25
156 157 @property
158 - def database(self):
159 return self._database
160 @database.setter
161 - def database(self, value):
162 database = value 163 pdbs25 = os.path.join(value, HHfrag.PDBS) 164 if not os.path.isfile(pdbs25): 165 raise ArgumentError('PDBS25 not found here: ' + pdbs25) 166 self._database = database 167 self._pdbs25 = pdbs25
168 169 @property
170 - def aligner(self):
171 return self._aligner
172 @aligner.setter
173 - def aligner(self, value):
174 if hasattr(value, 'run') and hasattr(value, 'runmany'): 175 self._aligner = value 176 else: 177 raise TypeError(value)
178
179 - def log(self, *a, **ka):
180 181 if self._app: 182 self._app.log(*a, **ka)
183
184 - def slice_query(self, min=6, max=21, step=3, cpu=None):
185 186 if not 0 < min <= max: 187 raise ArgumentError('min and max must be positive numbers, with max >= min') 188 if not 0 < step: 189 raise ArgumentError('step must be positive number') 190 191 self.log('\n# Processing profile HMM "{0}"...'.format(self.query.id)) 192 self.log('', level=2) 193 qp = self.query 194 hsqs = [] 195 196 if not cpu: 197 cpu = max - min + 1 198 199 for start in range(1, qp.layers.length - min + 1 + 1, step): 200 201 self.log('{0:3}. '.format(start), ending='', level=1) 202 probes = [] 203 204 for end in range(start + min - 1, start + max): 205 if end > qp.layers.length: 206 break 207 context = SliceContext(qp.segment(start, end), start, end) 208 probes.append(context) 209 210 probes = self.aligner.runmany(probes, workers=cpu) 211 probes.sort() 212 213 if len(probes) > 0: 214 rep = probes[-1] 215 hsqs.append(rep) 216 self.log('{0.start:3} {0.end:3} ({0.length:2} aa) {0.recurrence:3} hits'.format(rep), level=1) 217 else: 218 self.log(' no hits', level=1) 219 220 self._hsqs = hsqs 221 return tuple(hsqs)
222
223 - def extract_fragments(self):
224 225 self.log('\n# Extracting fragments...') 226 227 if self._hsqs is None: 228 raise InvalidOperationError('The query has to be sliced first') 229 230 fragments = [] 231 232 for si in self._hsqs: 233 self.log('\nSEGMENT: {0.start:3} {0.end:3} ({0.recurrence})'.format(si), level=2) 234 235 for hit in si.hits: 236 cn = 0 237 for chunk in hit.alignment.segments: 238 chunk.qstart = chunk.qstart + si.start - 1 239 chunk.qend = chunk.qend + si.start - 1 240 cn += 1 241 self.log(' {0.id:5} L{0.qstart:3} {0.qend:3} {0.length:2}aa P:{0.probability:5.3f}'.format(chunk), ending='', level=2) 242 243 sourcefile = os.path.join(self.database, hit.id + '.pdb') 244 if not os.path.isfile(sourcefile): 245 self.log(' missing', level=2) 246 continue 247 source = csb.bio.io.StructureParser(sourcefile).parse().first_chain 248 assert hit.id[-1] == source.id 249 250 source.compute_torsion() 251 try: 252 fragment = csb.bio.fragments.Assignment(source, 253 chunk.start, chunk.end, hit.id, 254 chunk.qstart, chunk.qend, chunk.probability, 255 rmsd=None, tm_score=None) 256 fragments.append(fragment) 257 if cn > 1: 258 self.log(' (chunk #{0})'.format(cn), level=2) 259 else: 260 self.log('', level=2) 261 262 except csb.bio.structure.Broken3DStructureError: 263 self.log(' corrupt', level=2) 264 continue 265 266 self._matches = fragments 267 return tuple(fragments)
268
269 - def _plot_lengths(self):
270 271 self.log('\n {0} ungapped assignments'.format(len(self._matches))) 272 self.log('', level=2) 273 274 histogram = {} 275 for f in self._matches: 276 histogram[f.length] = histogram.get(f.length, 0) + 1 277 278 for length in sorted(histogram): 279 280 percent = histogram[length] * 100.0 / len(self._matches) 281 bar = '{0:3} |{1} {2:5.1f}%'.format(length, 'o' * int(percent), percent) 282 self.log(bar, level=2)
283
284 - def build_fragment_map(self):
285 286 self.log('\n# Building dynamic fragment map...') 287 288 if self._matches is None: 289 raise InvalidOperationError('You need to extract some fragments first') 290 291 self._plot_lengths() 292 293 target = csb.bio.fragments.Target.from_profile(self.query) 294 target.assignall(self._matches) 295 296 factory = csb.bio.fragments.RosettaFragsetFactory() 297 return factory.make_fragset(target)
298
299 - def _filter_event_handler(self, ri):
300 if ri.rep is None: 301 self.log('{0.rank:3}. {0.confidence:5.3f} {0.count:3} - - -'.format(ri, ri.rep), level=2) 302 else: 303 self.log('{0.rank:3}. {0.confidence:5.3f} {0.count:3} {1.id:5} {1.start:3} {1.end:3}'.format(ri, ri.rep), level=2)
304
305 - def build_filtered_map(self):
306 307 self.log('\n# Building filtered map...') 308 self.log('\n Confidence Count Representative', level=2) 309 310 target = csb.bio.fragments.Target.from_profile(self.query) 311 target.assignall(self._matches) 312 313 factory = csb.bio.fragments.RosettaFragsetFactory() 314 return factory.make_filtered(target, extend=True, 315 callback=self._filter_event_handler)
316
317 - def _merge_event_handler(self, rei):
318 if rei.confidence is None: 319 self.log('{0.rank:3}. - {0.count:3}'.format(rei), level=2) 320 else: 321 self.log('{0.rank:3}. {0.confidence:5.3f} {0.count:3}'.format(rei), level=2)
322
323 - def build_combined_map(self, fragfile, top=25):
324 325 self.log('\n# Building complemented map...') 326 327 try: 328 filling = rosetta.RosettaFragmentMap.read(fragfile, top=top) 329 except IOError as io: 330 raise ArgumentIOError(str(io)) 331 332 self.log('\n {0} rosetta fragments loaded'.format(filling.size)) 333 self.log(' Confidence Count', level=2) 334 335 target = csb.bio.fragments.Target.from_profile(self.query) 336 target.assignall(self._matches) 337 338 factory = csb.bio.fragments.RosettaFragsetFactory() 339 return factory.make_combined(target, filling, threshold=0.5, 340 callback=self._merge_event_handler)
341
342 343 -class SliceContext(hhsearch.Context):
344
345 - def __init__(self, segment, start, end):
346 347 self.start = start 348 self.end = end 349 350 if not isinstance(segment, csb.core.string): 351 segment = segment.to_hmm(convert_scores=True) 352 353 super(SliceContext, self).__init__(segment)
354 355 @property
356 - def length(self):
357 return self.end - self.start + 1
358 359 @property
360 - def hits(self):
361 return self.result
362 363 @property
364 - def recurrence(self):
365 return len(self.result)
366
367 - def __lt__(self, other):
368 369 if self.recurrence == other.recurrence: 370 return self.length < other.length 371 else: 372 return self.recurrence < other.recurrence
373 374 375 376 377 if __name__ == '__main__': 378 379 AppRunner().run() 380