Logo Search packages:      
Sourcecode: bkchem version File versions  Download package

inchi-experimental.py

#--------------------------------------------------------------------------
#     This file is part of OASA - a free chemical python library
#     Copyright (C) 2003 Beda Kosata <beda@zirael.org>

#     This program is free software; you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation; either version 2 of the License, or
#     (at your option) any later version.

#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.

#     Complete text of GNU GPL can be found in the file gpl.txt in the
#     main directory of the program

#--------------------------------------------------------------------------

from plugin import plugin
from molecule import molecule
import periodic_table as pt
import molfile
import misc

import re
from sets import Set
import operator
import time
import xml.dom.minidom as dom
import dom_extensions
import string
import os.path
import tempfile
import popen2
import coords_generator
from oasa_exceptions import oasa_not_implemented_error
from tempfile import mkstemp


class inchi( plugin):

  name = "inchi"
  read = 1
  write = 0

  proton_donors = ['O','F','Cl','Br','I','N','S']
  proton_acceptors = ['P','N','S','O']
  

  def __init__( self, structure=None):
    self.structure = structure
    self._no_possibility_to_improve = False
    # results of parsing
    self._charge = 0
    self._protonation = 0
    self._movable_hydrogens = []
    #
    self.cleanup()

  def cleanup( self):
    self.forced_charge = 0
    self._protonation_dealt_with_already = 0
    self._added_hs = Set()

  def set_structure( self, structure):
    self.structure = structure

  def get_structure( self):
    return self.structure



  def split_layers( self, text):
    # try if we have a xml document in hand
    try:
      doc = dom.parseString( text)
    except:
      # it seems not to be the case
      return re.split( "/", text.strip())
    structs = doc.getElementsByTagName( 'structure')
    if not structs:
      raise "no structures found in xml string %s" % text
    struct = structs[0]
    layers = []
    for name in ('version','formula','connections','H'):
      try:
        layer = struct.getElementsByTagName( name)[0]
      except IndexError:
        raise "no '%s' tag found in xml string" % name
      layers.append( dom_extensions.getTextFromElement( layer))
    return layers




  def get_layer( self, prefix):
    for l in self.layers:
      if l.startswith( prefix):
        return l[1:]




  def read_inchi( self, text):
    self.structure = molecule()
    self.layers = self.split_layers( text)
    # version support (very crude)
    self.version = self._check_version( self.layers[0])
    if not self.version:
      raise ValueError, "this version of INChI is not supported - %s" % self.layers[0]
    
    self.hs_in_hydrogen_layer = self.get_number_of_hydrogens_in_hydrogen_layer()

    # at first we read all the information from the inchi string
    self.read_sum_layer()
    self.read_connectivity_layer()
    self.read_charge_layer()
    self.read_protonation_layer()
    self.read_hydrogen_layer()

    assert self.put_it_all_together()




  def read_sum_layer( self):
    if "." in self.layers[1]:
      raise oasa_not_implemented_error( "INChI", "multiple compound systems are not supported by the library")

    form = pt.formula_dict( self.layers[1])
    processed_hs = 0 #for diborane and similar compounds we must process some Hs here
    for k in form.sorted_keys():
      for i in range( form[k]):
        if k == 'H':
          # we want to process only the Hs that are not in the h-layer
          if processed_hs >= form[k] - self.hs_in_hydrogen_layer:
            continue
          else:
            processed_hs += 1
        a = self.structure.create_vertex()
        a.symbol = k
        self.structure.add_vertex( a)



  def read_connectivity_layer( self):
    layer = self.get_layer( "c")
    chunks = re.split( "([0-9]*)", layer)
    chunks = filter( None, chunks)
    chunks = filter( lambda x: x!='-', chunks)
    last_atom = None
    bracket_openings = []
    for c in chunks:
      if c == '(':
        bracket_openings.append( last_atom)
      elif c == ')':
        last_atom = bracket_openings.pop(-1)
      elif c == ",":
        last_atom = bracket_openings.pop(-1)
        bracket_openings.append( last_atom)
      else:
        try:
          i = int( c)
        except:
          print "should not happen", c
          continue
        # atom
        if last_atom:
          self.structure.add_edge( last_atom-1, i-1)
        last_atom = i


  def read_hydrogen_layer( self, run=0):
    layer = self.get_layer( "h")
    # check presence of the layer
    if not layer:
      return

    re_for_brackets = "\([H\d,\-]+?\)"
    brackets = re.findall( re_for_brackets, layer)
    for bracket in brackets:
      self._process_moving_hydrogen( bracket[1:-1], run=run)

    # we improve only in the _process_moving_hydrogen so if it was not called there is no possibility for improvement
    if not self._added_hs:
      self._no_possibility_to_improve = True
      
    layer = re.sub( re_for_brackets, "", layer)  # clean the brackets out

    for vs, num in self._parse_h_layer( layer):
      for v in vs:
        for j in range( num):
          h = self.structure.create_vertex()
          h.symbol = 'H'
          self.structure.add_vertex( h)
          self.structure.add_edge( v-1, h)
          self._added_hs.add( h)



  def read_charge_layer( self):
    layer = self.get_layer( "q")
    if not layer:
      return

    self._charge = int( layer[1:])



  def read_protonation_layer( self):
    layer = self.get_layer( "p")
    if not layer:
      return

    self._protonation = int( layer[1:])
    self._charge += self._protonation



  def _valency_to_charge( self, v, charge):
    """sets charge of the atom so that it has minimal free_valency,
    returns the 'unused' charge in case it is not comsumed whole"""
    ch = misc.signum( charge) * min( abs( charge), v.free_valency)
    if charge < 0:
      charge += ch
      v.charge = -ch
    else:
      charge -= ch
      v.charge = ch
    return charge

        

  def get_number_of_hydrogens_in_hydrogen_layer( self):
    # version check
    layer = self.get_layer( "h")
    if not layer:
      return 0

    # check if we can handle it
    if "*" in layer or ";" in layer:
      raise oasa_not_implemented_error( "INChI", "multiple compound systems are not supported by the library")

    ret = 0

    re_for_brackets = "\([H\d,\-]+?\)"
    brackets = re.findall( re_for_brackets, layer)
    for bracket in brackets:
      ret += self._get_hs_in_moving_hydrogen( bracket[1:-1])
    layer = re.sub( re_for_brackets, "", layer)  # clean the brackets out

    for vs, num in self._parse_h_layer( layer):
      ret += len( vs) * num

    return ret


  def _get_hs_in_moving_hydrogen( self, chunk):
    chks = chunk.split( ',')
    if len( chks[0]) > 1:
      if chks[0][1:].endswith( "-"):
        hs = int( chks[0][1:-1] or 1)  # number of atoms 
      else:
        hs = int( chks[0][1:])  # number of atoms
    else:
      hs = 1
    return hs
    


  def _check_version( self, ver):
    """returns a tuple of two version numbers"""
    if "Beta" in ver:
      return None
    v = ver.strip(string.ascii_lowercase+string.ascii_uppercase)
    if "." in v:
      return v.split('.')
    else:
      return v, 0


  def _process_moving_hydrogen( self, chunk, run=0):
    chks = chunk.split( ',')
    if len( chks[0]) > 1:
      if chks[0][1:].endswith("-"):
        hs = int( chks[0][1:-1] or 1) + 1  # number of atoms 
        self._protonation_dealt_with_already += 1
      else:
        hs = int( chks[0][1:])  # number of atoms
    else:
      hs = 1

    vertices = [self.structure.vertices[ int( i)-1] for i in chks[1:]]
    self._movable_hydrogens.append( [hs, vertices])




  def _split_h_layer( self, layer):
    was_h = False
    chunks = []
    chunk = ""
    for c in layer:
      if c == 'H':
        was_h = True
        chunk += c
      elif c == "," and was_h:
        was_h = False
        chunks.append( chunk)
        chunk = ""
      else:
        chunk += c
    if chunk:
      chunks.append( chunk)
    return chunks


  def _parse_h_layer( self, layer):
    chunks = self._split_h_layer( layer)
    for chunk in chunks:
      head, tail = chunk.split( 'H')
      num_h = tail and int( tail) or 1
      vertices = []
      for p in head.split( ","):
        if "-" in p:
          a, b = map( int, p.split("-"))
          vertices.extend( range( a, b+1))
        else:
          vertices.append( int( p))
      yield vertices, num_h



  def process_forced_charges( self):
    """this marks the charges that are forced by the connectivity and thus helps
    process zwitrions"""
    forced_charge = 0
    for v in self.structure.vertices:
      if v.symbol in ("N",) and v.free_valency == -1:
        v.charge = 1
        forced_charge += 1
        
    self.forced_charge = forced_charge


  def put_it_all_together( self):
    self.process_forced_charges()
    for a in self._gen_all_charge_positions():
      for b in self._gen_all_movable_hydrogen_positions():
        self._add_missing_bond_orders()
        if not filter( None, [v.free_valency for v in self.structure.vertices]) and \
               not filter( None, [not v.order for v in self.structure.edges]):
          return True
        for b in self.structure.bonds:
          b.order = 1
        for v in self.structure.vertices:
          v.symbol = v.symbol
    return False



  def _gen_all_charge_positions( self):
    if self._charge < 0:
      vs = [v for v in self.structure.vertices if v.symbol in self.proton_donors]
      n = -self._charge
      addition = -1
    else:
      vs = [v for v in self.structure.vertices if v.symbol in self.proton_acceptors]
      n = self._charge
      addition = 1

    for var in misc.gen_variations( vs, n):
      for v in var:
        v.charge += addition
      yield 1 
      for v in var:
        v.charge = 0
     


  def _gen_all_movable_hydrogen_positions( self):
    gens = [self._gen_all_movable_hydrogen_positions_for_one_chunk( vs, n) for n, vs in self._movable_hydrogens]
    if len( gens) == 0:
      yield 1
      raise StopIteration
    if len( gens) == 1:
      for gen in gens[0]:
        yield 1
    elif len( gens) == 2:
      for gen in gens[0]:
        for gen2 in gens[1]:
          yield 1
    else:
      raise ValueError( "shit, too many movable hydrogen chunks " + str( len( gens)))
      


  def _gen_all_movable_hydrogen_positions_for_one_chunk( self, vs, n):
    _added_hs = Set()
    for i in range( n+1, 1, -1):
      for variation in misc.gen_variations( vs, i):
        # cleanup
        for h in _added_hs:
          self.structure.remove( h)
        # the code itself
        _added_hs = Set()
        hs = n
        while hs:
          for v in variation:
            h = self.structure.create_vertex()
            h.symbol = 'H'
            self.structure.add_vertex( h)
            self.structure.add_edge( v, h)
            self._added_hs.add( h)
            hs -= 1
            if not hs:
              break
        yield 1
      
  def _add_missing_bond_orders( self):
    [v.raise_valency_to_senseful_value() for v in self.structure.vertices if v.free_valency < 0]

    fix = True
    while fix:
      fix = False
      for v in self.structure.vertices:
        if sum( [n.free_valency for n in v.neighbors]) < v.free_valency:
          for n in v.neighbors:
            if n.raise_valency():
              fix = True
              break
        if fix:
          break

    [self._add_missing_bond_orders_to_one_chunk( comp) for comp in self.structure._gen_free_valency_connected_components()]
    
      

  def _add_missing_bond_orders_to_one_chunk( self, comp):
    if len( comp) % 2:
      done = False
      # the chunk has odd number of free_valency atoms, we have create some more
      for v in comp:
        for n in v.neighbors:
          if n.free_valency == 0:
            done = n.raise_valency_to_senseful_value()
            if done:
              comp.add( n)
              break
        if done:
          break
            
    # here comes the code itself
    es = self.structure.vertex_subgraph_to_edge_subgraph( comp)
    if len( es) >= len( comp):
      # there is a ring in the there
      e = list( es)[0]
      self._maximize_bond_order( es, e)
      if filter( None, [v.free_valency for v in self.structure.vertices]) or \
             filter( None, [not v.order for v in self.structure.edges]):
        # cleanup
        for b in es:
          b.order = 1
        e = [x for x in e.get_neighbor_edges() if x in es][0]
        self._maximize_bond_order( es, e)
    else:
      # no ring - we simply start at one end and go further
      for v in comp:
        if len( [n for n in v.neighbors if n in comp]) == 1:
          # we have an end
          break
      ende = (es & Set( v.get_neighbor_edges())).pop()
      self._maximize_bond_order( es, ende)



  def _maximize_bond_order( self, bonds, bond):
    bond.order += min( [v.free_valency for v in bond.vertices])
    v1, v2 = bond.vertices
    bonds = bonds - Set([bond])
    to_go = bonds & (Set( [b for b in v1.get_neighbor_edges()]) | Set( [b for b in v1.get_neighbor_edges()]))
    
    [self._maximize_bond_order( bonds, b) for b in to_go]



def generate_inchi( m):
  program = "/home/beda/inchi/cInChI-1"

  f, name = mkstemp( text=True)
  os.close( f)
  
  file = open( name, "w")
  molfile.mol_to_file( m, file)
  file.close()

  f, in_name = mkstemp( text=True)
  os.close( f)

  if os.name == 'nt':
    options = "/AUXNONE"
  else:
    options = "-AUXNONE"
  command = ' '.join( (program, name, in_name, options))
  popen = popen2.Popen4( command)
  exit_code = popen.wait()
  #exit_code = os.spawnv( os.P_WAIT, program, (program, name, in_name, options))

  if exit_code == 0 or exit_code != 0:
    in_file = open( in_name, 'r')
    # go to the last line
    i = 0
    for line in in_file.readlines():
      i += 1
    if i > 1:
      out = ( line[6:].strip())
    else:
      # single line file is the only way how to determine it has crashed
      out = ""
    in_file.close()
  else:
    out = ''

  os.remove( in_name)
  os.remove( name)

  return out

    




##################################################
# MODULE INTERFACE

reads_text = 1
reads_files = 1
writes_text = 1
writes_files = 1

def text_to_mol( text, include_hydrogens=True, mark_aromatic_bonds=False, calc_coords=1):
  inc = inchi()
  inc.read_inchi( text)
  mol = inc.structure
  #mol.add_missing_bond_orders()
  if not include_hydrogens:
    mol.remove_all_hydrogens()
  if calc_coords:
    coords_generator.calculate_coords( mol)
  if mark_aromatic_bonds:
    mol.mark_aromatic_bonds()
  return mol


def file_to_mol( f):
  return text_to_mol( f.read())


def mol_to_text( mol):
  return generate_inchi( mol)


def mol_to_file( mol, f):
  f.write( mol_to_text( mol))



#
##################################################




##################################################
# DEMO

if __name__ == '__main__':

  import smiles

  def main( text, cycles):
    t1 = time.time()
    for jj in range( cycles):
      mol = text_to_mol( text)
      print map( str, [b for b in mol.bonds if b.order == 0])
      print "  smiles: ", smiles.mol_to_text( mol)
      print "  inchi:  ", mol_to_text( mol)
      print "  charge: ", sum( [a.charge for a in mol.vertices])
    t1 = time.time() - t1
    print 'time per cycle', round( 1000*t1/cycles, 2), 'ms'

  repeat = 3
  inch = "1/C12H22NO/c1-13(2,3)11-7-10-12(14)8-5-4-6-9-12/h14H,4-6,8-9,11H2,1-3H3/q+1" #"1/C6H6N2O3S/c1-3(9)5-2-4(8(10)11)6(7)12-5/h2H,7H2,1H3" #"1/C4H6N2O3S/c1-5-2-3-6(4-5)10(7,8)9/h2-4H,1H3/p+1"
  print "oasa::INCHI DEMO"
  print "converting following inchi into smiles (%d times)" % repeat
  print "  inchi: %s" % inch
  
  #import profile
  #profile.run( 'main( inch, repeat)')
  main( inch, repeat)

# DEMO END
##################################################



##################################################
## TODO

## rename layer to sublayer


##################################################

Generated by  Doxygen 1.6.0   Back to index