fix: support xyz format (#669)

* fix: support xyz format * fix: support nopbc in command line
deepmodeling · Apr 1, 2020 · 4d5c168 · 4d5c168 · vercel · Apr 1, 2020
1 parent a055f0f
commit 4d5c168
Show file tree

Hide file tree

Showing 3 changed files with 105 additions and 53 deletions.
diff --git a/reacnetgenerator/_detect.py b/reacnetgenerator/_detect.py
@@ -45,7 +45,8 @@ class _Detect(SharedRNGData, metaclass=ABCMeta):
     subclasses = {}
 
     def __init__(self, rng):
-        SharedRNGData.__init__(self, rng, ['inputfilename', 'atomname', 'stepinterval', 'nproc', 'pbc'],
+        SharedRNGData.__init__(self, rng, ['inputfilename', 'atomname', 'stepinterval', 'nproc', 'pbc',
+                                           'cell'],
                                ['N', 'atomtype', 'step', 'timestep', 'temp1it', 'moleculetempfilename'])
 
     @classmethod
@@ -155,18 +156,66 @@ def _readstepfunc(self, item):
                     level[s0] = map(self._get_bo, s[4+s2:4+2*s2])
         molecules = self._connectmolecule(bond, level)
         return molecules, (step, timestep)
-    
+
     @staticmethod
     def _get_idx(x):
         return int(x) - 1
-    
+
     @staticmethod
     def _get_bo(x):
         return max(1, round(float(x)))
 
 
+class _DetectCrd(_Detect):
+    def _getbondfromcrd(self, step_atoms, cell):
+        atomnumber = len(step_atoms)
+        if self.pbc:
+            # Apply period boundry conditions
+            step_atoms.set_pbc(True)
+            step_atoms.set_cell(cell)
+            # add ghost atoms
+            repeated_atoms = step_atoms.repeat(2)[atomnumber:]
+            tree = cKDTree(step_atoms.get_positions())
+            d = tree.query(repeated_atoms.get_positions(), k=1)[0]
+            nearest = d < 5
+            ghost_atoms = repeated_atoms[nearest]
+            realnumber = np.where(nearest)[0] % atomnumber
+            step_atoms += ghost_atoms
+        # Use openbabel to connect atoms
+        mol = openbabel.OBMol()
+        mol.BeginModify()
+        for idx, (num, position) in enumerate(zip(step_atoms.get_atomic_numbers(), step_atoms.positions)):
+            a = mol.NewAtom(idx)
+            a.SetAtomicNum(int(num))
+            a.SetVector(*position)
+        mol.ConnectTheDots()
+        mol.PerceiveBondOrders()
+        mol.EndModify()
+        bond = [[] for i in range(atomnumber)]
+        bondlevel = [[] for i in range(atomnumber)]
+        for b in openbabel.OBMolBondIter(mol):
+            s1 = b.GetBeginAtom().GetId()
+            s2 = b.GetEndAtom().GetId()
+            if s1 >= atomnumber and s2 >= atomnumber:
+                # duplicated
+                continue
+            elif s1 >= atomnumber:
+                s1 = realnumber[s1-atomnumber]
+            elif s2 >= atomnumber:
+                s2 = realnumber[s2-atomnumber]
+            level = b.GetBondOrder()
+            if level == 5:
+                # aromatic, 5 in openbabel but 12 in rdkit
+                level = 12
+            bond[s1].append(s2)
+            bond[s2].append(s1)
+            bondlevel[s1].append(level)
+            bondlevel[s2].append(level)
+        return bond, bondlevel
+
+
 @_Detect.register_subclass("lammpsdumpfile")
-class _DetectLAMMPSdump(_Detect):
+class _DetectLAMMPSdump(_DetectCrd):
     class LineType(Enum):
         """Line type in the LAMMPS dump files."""
 
@@ -247,48 +296,40 @@ def _readstepfunc(self, item):
         molecules = self._connectmolecule(bond, level)
         return molecules, timestep
 
-    def _getbondfromcrd(self, step_atoms, cell):
-        atomnumber = len(step_atoms)
-        if self.pbc:
-            # Apply period boundry conditions
-            step_atoms.set_pbc(True)
-            step_atoms.set_cell(cell)
-            # add ghost atoms
-            repeated_atoms = step_atoms.repeat(2)[atomnumber:]
-            tree = cKDTree(step_atoms.get_positions())
-            d = tree.query(repeated_atoms.get_positions(), k=1)[0]
-            nearest = d < 5
-            ghost_atoms = repeated_atoms[nearest]
-            realnumber = np.where(nearest)[0] % atomnumber
-            step_atoms += ghost_atoms
-        # Use openbabel to connect atoms
-        mol = openbabel.OBMol()
-        mol.BeginModify()
-        for idx, (num, position) in enumerate(zip(step_atoms.get_atomic_numbers(), step_atoms.positions)):
-            a = mol.NewAtom(idx)
-            a.SetAtomicNum(int(num))
-            a.SetVector(*position)
-        mol.ConnectTheDots()
-        mol.PerceiveBondOrders()
-        mol.EndModify()
-        bond = [[] for i in range(atomnumber)]
-        bondlevel = [[] for i in range(atomnumber)]
-        for b in openbabel.OBMolBondIter(mol):
-            s1 = b.GetBeginAtom().GetId()
-            s2 = b.GetEndAtom().GetId()
-            if s1 >= atomnumber and s2 >= atomnumber:
-                # duplicated
-                continue
-            elif s1 >= atomnumber:
-                s1 = realnumber[s1-atomnumber]
-            elif s2 >= atomnumber:
-                s2 = realnumber[s2-atomnumber]
-            level = b.GetBondOrder()
-            if level == 5:
-                # aromatic, 5 in openbabel but 12 in rdkit
-                level = 12
-            bond[s1].append(s2)
-            bond[s2].append(s1)
-            bondlevel[s1].append(level)
-            bondlevel[s2].append(level)
-        return bond, bondlevel
+
+@_Detect.register_subclass("xyz")
+class _Detectxyz(_DetectCrd):
+    """xyz file. Two frames are connected. Cell information must be inputed by user."""
+
+    def _readNfunc(self, f):
+        atomname_dict = dict(zip(self.atomname.tolist(), range(self.atomname.size)))
+        for index, line in enumerate(f):
+            s = line.split()
+            if index == 0:
+                N = int(line.strip())
+                atomtype = np.zeros(N, dtype=int)
+            elif index > N+1:
+                break
+            elif index > 1:
+                atomtype[index-2] = atomname_dict[s[0]]
+        self.N = N
+        self.atomtype = atomtype
+        steplinenum = N + 2
+        return steplinenum
+
+    def _readstepfunc(self, item):
+        step, lines = item
+        timestep = step, step
+        step_atoms = []
+        boxsize = self.cell
+        if self.pbc and boxsize is None:
+            raise RuntimeError("No cell information is given.")
+        for index, line in enumerate(lines):
+            s = line.split()
+            if index > 1:
+                step_atoms.append((index-1, Atom(s[0] ,tuple((float(x) for x in s[1:4])))))
+        _, step_atoms = zip(*sorted(step_atoms, key=operator.itemgetter(0)))
+        step_atoms = Atoms(step_atoms)
+        bond, level = self._getbondfromcrd(step_atoms, boxsize)
+        molecules = self._connectmolecule(bond, level)
+        return molecules, timestep
diff --git a/reacnetgenerator/commandline.py b/reacnetgenerator/commandline.py
@@ -17,6 +17,12 @@ def _commandline():
         action="store_true")
     parser.add_argument(
         '--dump', help='Process the LAMMPS dump file', action="store_true")
+    parser.add_argument(
+        '--type', '-t', help='Input file type', default='lammpsbondfile')
+    parser.add_argument(
+        '--nopbc', help='Disable PBC.', action="store_true")
+    parser.add_argument(
+        '--cell', '-c', nargs=3, type=float, help='Cell')
     parser.add_argument(
         '-n', '-np', '--nproc', help='Number of processes', type=int)
     parser.add_argument(
@@ -39,7 +45,7 @@ def _commandline():
     ReacNetGenerator(
         inputfilename=args.inputfilename, atomname=args.atomname,
         runHMM=not args.nohmm,
-        inputfiletype=('lammpsdumpfile' if args.dump else 'lammpsbondfile'),
+        inputfiletype=('lammpsdumpfile' if args.dump else args.type),
         nproc=args.nproc, selectatoms=args.selectatoms,
         stepinterval=args.stepinterval,
         split=args.split,
@@ -48,6 +54,8 @@ def _commandline():
               for url in args.urls] if args.urls else None,
         a=np.array(args.matrixa).reshape((2, 2)),
         b=np.array(args.matrixb).reshape((2, 2)),
+        pbc=not args.nopbc,
+        cell=args.cell,
     ).runanddraw()
 
 
@@ -56,8 +64,8 @@ def parm2cmd(pp):
                 pp['inputfilename'], '-a', *pp['atomname']]
     if not pp.get('runHMM', True):
         commands.append('--nohmm')
-    if pp['inputfiletype'] == 'lammpsdumpfile':
-        commands.append('--dump')
+    if pp['inputfiletype']:
+        commands.extend(('--type', pp['inputfiletype']))
     if pp['atomname']:
         commands.extend(('-s', pp['atomname'][0]))
     if pp.get('urls', []):
@@ -69,6 +77,9 @@ def parm2cmd(pp):
     if pp.get('b', []):
         commands.extend(
             ('--matrixb', str(pp['b'][0][0]), str(pp['b'][0][1]), str(pp['b'][1][0]), str(pp['b'][1][1])))
+    if pp.get('cell', []):
+        commands.extend(
+            ('--cell', str(pp['cell'][0]), str(pp['cell'][1]), str(pp['cell'][2])))
     for ii in ['nproc', 'selectatoms', 'stepinterval', 'split', 'maxspecies']:
         if pp.get(ii, None):
             commands.extend(("--{}".format(ii), str(pp[ii])))

diff --git a/reacnetgenerator/reacnetgen.py b/reacnetgenerator/reacnetgen.py
@@ -80,7 +80,7 @@ def __init__(self, **kwargs):
                             "nolabel": False,  "printfiltersignal": False, "showid": True, "runHMM": True, "SMILES": True,
                             "getoriginfile": False, "needprintspecies": True, "urls": []
                             }
-        none_key = ['selectatoms', 'species', 'pos', 'k', 'speciescenter']
+        none_key = ['selectatoms', 'species', 'pos', 'k', 'speciescenter', 'cell']
         accept_keys = ['atomtype', 'step', 'hmmit', 'timestep', 'steplinenum', 'N',
             'temp1it', 'originfilename', 'hmmfilename', 'moleculetempfilename', 'moleculetemp2filename',
             'allmoleculeroute', 'splitmoleculeroute']