Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new sx structure parser that tolerates unexpected items #1253

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
201 changes: 201 additions & 0 deletions pyiron_atomistics/sphinx/parser_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
# coding: utf-8
# Copyright (c) Max-Planck-Institut für Eisenforschung GmbH - Computational Materials Design (CM) Department
# Distributed under the terms of "New BSD License", see the LICENSE file.

__author__ = "Christoph Freysoldt"
__copyright__ = (
"Copyright 2023, Max-Planck-Institut für Eisenforschung GmbH - "
"Computational Materials Design (CM) Department"
)
__version__ = "1.0"
__maintainer__ = "Christoph Freysoldt"
__email__ = "[email protected]"
__status__ = "production"
__date__ = "Dec 8, 2023"

import re
from types import GeneratorType
import numpy
freyso marked this conversation as resolved.
Show resolved Hide resolved


class KeywordTreeParser:
    """
    A base class to parse files block by block via keyword-triggered
    parsing routines organized in a tree.

    Parsing routines can add more levels of keyword->parse-function maps
    by appending to ``self.keylevels``. The file is read line by line on
    demand while parsing, so large files will not clobber memory.

    A parser routine can either return or yield (once!) to continue
    parsing. If it yields, the rest of the routine (after the yield) will
    be executed when the next keyword of the current or a higher level is
    found.

    Every parser routine MUST remove the keyword from ``self.lineview``.

    A typical use will be::

        class my_parser(KeywordTreeParser):
            def __init__(self, file):
                super().__init__({
                    "key1": self.parse_key1,
                    "key2": self.parse_key2})
                self.parse(file)

    """

    def __init__(self, keylevels=None):
        """
        Args:
            keylevels (dict or list of dicts): map(s) from keyword to the
                parsing routine triggered by that keyword.
        """
        # NOTE: avoid a mutable default argument ([]); it would be shared
        # across all instances that do not pass keylevels explicitly.
        if keylevels is None:
            keylevels = []
        if isinstance(keylevels, dict):
            keylevels = [keylevels]
        elif not isinstance(keylevels, list):
            raise TypeError("keylevels must be a dict or a list of dicts")
        self.keylevels = keylevels

    def parse(self, filename):
        """
        Parse a file using the current keylevels.

        Args:
            filename (str): the name of the file to parse
        Returns: nothing
        """
        # --- initialization
        if len(self.keylevels) == 0:
            raise KeyError("No parsing functions available in keylevels")
        # the following attributes only exist while parsing
        self.lineview = ""
        self.filename = filename
        self.lineno = 0
        self.line_from = 0
        # context manager guarantees the file handle is closed even if a
        # parsing routine raises (the original `close(filehandle)` was a
        # NameError and leaked the handle on errors)
        with open(filename) as filehandle:
            self.line = iter(filehandle)
            while True:
                if self._dispatch_keyword():
                    # a routine ran and consumed its keyword; re-scan the
                    # remaining lineview before reading more input
                    continue
                # no keyword in the current view: fetch the next line
                try:
                    self.lineview = next(self.line)
                    self.lineno += 1
                    self.line_from = self.lineno
                except StopIteration:
                    break
            self._cleanup(self.keylevels[0])
            if hasattr(self, "finalize"):
                self.finalize()
        # clean up object attributes that only exist during parsing
        del (self.filename, self.line, self.lineno, self.line_from, self.lineview)

    def _dispatch_keyword(self):
        """
        (internal) Run the parsing routine of the first keyword (scanning
        levels top-down) found in the current lineview.

        Returns:
            bool: True if a routine was dispatched, False otherwise.
        """
        for keymap in self.keylevels:
            for key, func in keymap.items():
                if key in self.lineview:
                    # drop deeper levels and run their pending finalizers
                    self._cleanup(keymap)
                    res = func()
                    if isinstance(res, GeneratorType):
                        # routine yielded: advance to the yield and keep
                        # the remainder as a finalizer for this level
                        res.send(None)
                        keymap["%finalize!"] = res
                    return True
        return False

    def location(self):
        """Return the current parsing location (for error messages)."""
        return f"in file '{self.filename}' line" + (
            f" {self.lineno}"
            if self.lineno == self.line_from
            else f"s {self.line_from}..{self.lineno}"
        )

    def read_until(self, match):
        """
        Append more lines from input until ``match`` is found.

        Args:
            match (str): what to wait for
        Returns: nothing
        """
        while match not in self.lineview:
            self.lineview += next(self.line)
            self.lineno += 1
            # line_from intentionally keeps pointing at the first line of
            # the accumulated view (the original erroneously assigned the
            # line iterator object to it here)

    def extract_via_regex(self, regex):
        """
        Extract and remove some text from the current lineview.

        Args:
            regex (str or compiled pattern): regular expression
        Returns:
            str: the extracted text
        Raises:
            RuntimeError: if the pattern is not found in lineview.
        """
        if isinstance(regex, str):
            regex = re.compile(regex, re.DOTALL)
        result = regex.search(self.lineview)
        if result is None:
            raise RuntimeError(
                f"Failed to extract '{regex.pattern}' "
                + self.location()
                + "\n"
                + self.lineview
            )
        self.lineview = regex.sub("", self.lineview, count=1)
        return result.group()

    def _cleanup(self, active):
        """
        (internal) Remove levels below the current (active) level, and
        call the (optional) pending finalizers up to the current level.

        Args:
            active (dict): the currently active map
        Returns: nothing
        """

        def try_finalize(keymap):
            # resume a generator-based routine past its yield, if any
            if "%finalize!" in keymap:
                try:
                    next(keymap["%finalize!"])
                except StopIteration:
                    pass
                del keymap["%finalize!"]

        # roll back keylevels until the active level
        while self.keylevels[-1] is not active:
            try_finalize(self.keylevels[-1])
            del self.keylevels[-1]
        # and call the optional finalizer of the currently active level
        try_finalize(active)

    def get_vector(self, key, txt):
        """
        (auxiliary function) Get a vector from 'key = [ ... ] ;'.

        Args:
            key (str): the key to look for
            txt (str): the text to extract the vector from
        Returns:
            numpy.ndarray: one-dimensional vector containing the numbers
        Raises:
            RuntimeError: if 'key = ... ;' is not found in txt.
        """
        # get the relevant part between '=' and ';'
        # (use re.search so a failed match is actually detected; the
        # original checked re.sub's result for None, which never happens)
        found = re.search(key + r"\s*=\s*([^;]+);", txt, re.DOTALL)
        if found is None:
            raise RuntimeError(
                f"Cannot parse {key} from '{txt}' as vector " + self.location()
            )
        # remove special characters [] , ; =
        vecstring = re.sub(r"[][=,;$]", " ", found.group(1))
        return numpy.fromstring(vecstring, sep=" ")

    def extract_var(self, key, startend="=;"):
        """
        Extract a block 'key = ... ;'.

        If the end pattern is not found in lineview, more lines are read.

        Args:
            key (str): the keyword
            startend (str): (optional) override the '=;' pair by two
                different single-character patterns
        Returns:
            str: the extracted block
        """
        self.read_until(startend[1])
        return self.extract_via_regex(
            key + r"\s*" + startend[0] + r"\s*[^" + startend[1] + "]+" + startend[1]
        )
144 changes: 70 additions & 74 deletions pyiron_atomistics/sphinx/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,100 +2,96 @@
# Copyright (c) Max-Planck-Institut für Eisenforschung GmbH - Computational Materials Design (CM) Department
# Distributed under the terms of "New BSD License", see the LICENSE file.

from collections import OrderedDict
import re
import numpy as np
import scipy.constants
from pyiron_atomistics.atomistics.structure.parser_base import KeywordTreeParser
from pyiron_atomistics.atomistics.structure.atoms import Atoms
from pyiron_atomistics.atomistics.structure.periodic_table import PeriodicTable

__author__ = "Sudarsan Surendralal, Jan Janssen"
__author__ = "Christoph Freysoldt"
__copyright__ = (
"Copyright 2021, Max-Planck-Institut für Eisenforschung GmbH - "
"Copyright 2023, Max-Planck-Institut für Eisenforschung GmbH - "
"Computational Materials Design (CM) Department"
)
__version__ = "1.0"
__maintainer__ = "Sudarsan Surendralal"
__email__ = "surendralal@mpie.de"
__version__ = "2.0"
__maintainer__ = "Christoph Freysoldt"
__email__ = "freysoldt@mpie.de"
__status__ = "production"
__date__ = "Feb 4, 2018"
__date__ = "Dec 8, 2023"

BOHR_TO_ANGSTROM = (
scipy.constants.physical_constants["Bohr radius"][0] / scipy.constants.angstrom
)


class StructParser(KeywordTreeParser):
    """
    This class reads one or more structures in sx (SPHInX) format.

    After construction, the parsed structures are available as a list of
    Atoms objects in ``self.configs``; optional per-atom labels are kept
    in ``self.labels`` (None for unlabeled atoms).
    """

    def __init__(self, file):
        """
        Args:
            file (str): filename of the sx structure file to parse
        """
        super().__init__({"structure": self.parse_structure})
        self.configs = []
        self.parse(file)

    def parse_structure(self):
        """Parses structure{} blocks (yields; finishes when block ends)."""
        self.keylevels.append({"cell": self.parse_cell, "species": self.parse_species})
        self.extract_via_regex("structure")
        # --- initialize for next structure
        self.cell = None
        self.positions = []
        self.species = []
        self.indices = []
        self.labels = []
        self.ispecies = -1
        # continue parsing
        yield
        # create Atoms object and append it to configs
        # (coordinates and cell in the sx file are in bohr)
        pse = PeriodicTable()
        atoms = Atoms(
            species=[pse.element(s) for s in self.species],
            indices=self.indices,
            cell=self.cell * BOHR_TO_ANGSTROM,
            positions=np.array(self.positions) * BOHR_TO_ANGSTROM,
            pbc=True,
        )
        self.configs.append(atoms)

    def parse_cell(self):
        """Read the cell (3x3 matrix, in bohr)."""
        txt = self.extract_var("cell")
        self.cell = self.get_vector("cell", txt).reshape(3, 3)

    def parse_species(self):
        """Parses species{} blocks."""
        self.extract_via_regex("species")
        self.keylevels.append({"element": self.get_element, "atom": self.read_atom})
        self.ispecies += 1

    def get_element(self):
        """Read the element name of the current species."""
        txt = self.extract_var("element")
        self.species.append(re.sub('.*"([^"]*)".*', r"\1", txt))

    def read_atom(self):
        """Read atomic coordinates (and an optional label) from an atom block."""
        txt = self.extract_var("atom", "{}")
        self.positions.append(self.get_vector("coords", txt))
        self.indices.append(self.ispecies)
        # store the optional label instead of printing it (the original
        # left a debugging print statement here)
        if "label" in txt:
            self.labels.append(re.sub(r'.*label\s*=\s*"([^"]+)"\s*;.*', r"\1", txt))
        else:
            self.labels.append(None)


def read_atoms(filename="structure.sx"):
    """
    Read one or more structures from a SPHInX structure file.

    Args:
        filename (str): Filename of the sphinx structure file

    Returns:
        pyiron_atomistics.objects.structure.atoms.Atoms instance if the
        file contains exactly one structure, otherwise a list of them.
    """
    configs = StructParser(filename).configs
    return configs[0] if len(configs) == 1 else configs
Loading