Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new sx structure parser that tolerates unexpected items #1253

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
201 changes: 201 additions & 0 deletions pyiron_atomistics/sphinx/parser_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
# coding: utf-8
# Copyright (c) Max-Planck-Institut für Eisenforschung GmbH - Computational Materials Design (CM) Department
# Distributed under the terms of "New BSD License", see the LICENSE file.

__author__ = "Christoph Freysoldt"
__copyright__ = (
"Copyright 2023, Max-Planck-Institut für Eisenforschung GmbH - "
"Computational Materials Design (CM) Department"
)
__version__ = "1.0"
__maintainer__ = "Christoph Freysoldt"
__email__ = "[email protected]"
__status__ = "production"
__date__ = "Dec 8, 2023"

import re
from types import GeneratorType
import numpy
freyso marked this conversation as resolved.
Show resolved Hide resolved


class KeywordTreeParser:
    """
    A base class to parse files block by block via keyword-triggered
    parsing routines organized in a tree.

    Parsing routines can add more levels of keyword->parse-function maps
    by appending to ``self.keylevels``. The file is read line by line on
    demand while parsing, so large files will not clobber memory.

    A parser routine can either return or yield (once!) to continue
    parsing. If it yields, the rest of the routine (after the yield) will
    be executed when the next keyword of the current or a higher level is
    found.

    Every parser routine MUST remove the keyword from ``self.lineview``.

    A typical use will be::

        class my_parser(KeywordTreeParser):
            def __init__(self, file):
                super().__init__({
                    "key1": self.parse_key1,
                    "key2": self.parse_key2})
                self.parse(file)

    """

    def __init__(self, keylevels=None):
        """
        Args:
            keylevels (dict or list of dicts): map(s) from keyword to the
                parsing routine triggered by that keyword.
        """
        # NOTE: avoid a mutable default argument ([]); it would be shared
        # across all instances that do not pass keylevels explicitly.
        if keylevels is None:
            keylevels = []
        if isinstance(keylevels, dict):
            keylevels = [keylevels]
        elif not isinstance(keylevels, list):
            raise TypeError("keylevels must be a dict or a list of dicts")
        self.keylevels = keylevels

    def parse(self, filename):
        """
        Parse a file using the current keylevels.

        Args:
            filename (str): the name of the file to parse
        Returns: nothing
        """
        # --- initialization
        if len(self.keylevels) == 0:
            raise KeyError("No parsing functions available in keylevels")
        # the following attributes only exist while parsing
        self.lineview = ""
        self.filename = filename
        self.lineno = 0
        self.line_from = 0
        # context manager guarantees the file handle is closed even if a
        # parsing routine raises (the original `close(filehandle)` was a
        # NameError and leaked the handle on errors)
        with open(filename) as filehandle:
            self.line = iter(filehandle)
            while True:
                if self._dispatch_keyword():
                    # a routine ran and consumed its keyword; re-scan the
                    # remaining lineview before reading more input
                    continue
                # no keyword in the current view: fetch the next line
                try:
                    self.lineview = next(self.line)
                    self.lineno += 1
                    self.line_from = self.lineno
                except StopIteration:
                    break
            self._cleanup(self.keylevels[0])
            if hasattr(self, "finalize"):
                self.finalize()
        # clean up object attributes that only exist during parsing
        del (self.filename, self.line, self.lineno, self.line_from, self.lineview)

    def _dispatch_keyword(self):
        """
        (internal) Run the parsing routine of the first keyword (scanning
        levels top-down) found in the current lineview.

        Returns:
            bool: True if a routine was dispatched, False otherwise.
        """
        for keymap in self.keylevels:
            for key, func in keymap.items():
                if key in self.lineview:
                    # drop deeper levels and run their pending finalizers
                    self._cleanup(keymap)
                    res = func()
                    if isinstance(res, GeneratorType):
                        # routine yielded: advance to the yield and keep
                        # the remainder as a finalizer for this level
                        res.send(None)
                        keymap["%finalize!"] = res
                    return True
        return False

    def location(self):
        """Return the current parsing location (for error messages)."""
        return f"in file '{self.filename}' line" + (
            f" {self.lineno}"
            if self.lineno == self.line_from
            else f"s {self.line_from}..{self.lineno}"
        )

    def read_until(self, match):
        """
        Append more lines from input until ``match`` is found.

        Args:
            match (str): what to wait for
        Returns: nothing
        """
        while match not in self.lineview:
            self.lineview += next(self.line)
            self.lineno += 1
            # line_from intentionally keeps pointing at the first line of
            # the accumulated view (the original erroneously assigned the
            # line iterator object to it here)

    def extract_via_regex(self, regex):
        """
        Extract and remove some text from the current lineview.

        Args:
            regex (str or compiled pattern): regular expression
        Returns:
            str: the extracted text
        Raises:
            RuntimeError: if the pattern is not found in lineview.
        """
        if isinstance(regex, str):
            regex = re.compile(regex, re.DOTALL)
        result = regex.search(self.lineview)
        if result is None:
            raise RuntimeError(
                f"Failed to extract '{regex.pattern}' "
                + self.location()
                + "\n"
                + self.lineview
            )
        self.lineview = regex.sub("", self.lineview, count=1)
        return result.group()

    def _cleanup(self, active):
        """
        (internal) Remove levels below the current (active) level, and
        call the (optional) pending finalizers up to the current level.

        Args:
            active (dict): the currently active map
        Returns: nothing
        """

        def try_finalize(keymap):
            # resume a generator-based routine past its yield, if any
            if "%finalize!" in keymap:
                try:
                    next(keymap["%finalize!"])
                except StopIteration:
                    pass
                del keymap["%finalize!"]

        # roll back keylevels until the active level
        while self.keylevels[-1] is not active:
            try_finalize(self.keylevels[-1])
            del self.keylevels[-1]
        # and call the optional finalizer of the currently active level
        try_finalize(active)

    def get_vector(self, key, txt):
        """
        (auxiliary function) Get a vector from 'key = [ ... ] ;'.

        Args:
            key (str): the key to look for
            txt (str): the text to extract the vector from
        Returns:
            numpy.ndarray: one-dimensional vector containing the numbers
        Raises:
            RuntimeError: if 'key = ... ;' is not found in txt.
        """
        # get the relevant part between '=' and ';'
        # (use re.search so a failed match is actually detected; the
        # original checked re.sub's result for None, which never happens)
        found = re.search(key + r"\s*=\s*([^;]+);", txt, re.DOTALL)
        if found is None:
            raise RuntimeError(
                f"Cannot parse {key} from '{txt}' as vector " + self.location()
            )
        # remove special characters [] , ; =
        vecstring = re.sub(r"[][=,;$]", " ", found.group(1))
        return numpy.fromstring(vecstring, sep=" ")

    def extract_var(self, key, startend="=;"):
        """
        Extract a block 'key = ... ;'.

        If the end pattern is not found in lineview, more lines are read.

        Args:
            key (str): the keyword
            startend (str): (optional) override the '=;' pair by two
                different single-character patterns
        Returns:
            str: the extracted block
        """
        self.read_until(startend[1])
        return self.extract_via_regex(
            key + r"\s*" + startend[0] + r"\s*[^" + startend[1] + "]+" + startend[1]
        )
144 changes: 70 additions & 74 deletions pyiron_atomistics/sphinx/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,100 +2,96 @@
# Copyright (c) Max-Planck-Institut für Eisenforschung GmbH - Computational Materials Design (CM) Department
# Distributed under the terms of "New BSD License", see the LICENSE file.

from collections import OrderedDict
import re
import numpy as np
import scipy.constants
from pyiron_atomistics.atomistics.structure.parser_base import KeywordTreeParser
from pyiron_atomistics.atomistics.structure.atoms import Atoms
from pyiron_atomistics.atomistics.structure.periodic_table import PeriodicTable

__author__ = "Sudarsan Surendralal, Jan Janssen"
__author__ = "Christoph Freysoldt"
__copyright__ = (
"Copyright 2021, Max-Planck-Institut für Eisenforschung GmbH - "
"Copyright 2023, Max-Planck-Institut für Eisenforschung GmbH - "
"Computational Materials Design (CM) Department"
)
__version__ = "1.0"
__maintainer__ = "Sudarsan Surendralal"
__email__ = "surendralal@mpie.de"
__version__ = "2.0"
__maintainer__ = "Christoph Freysoldt"
__email__ = "freysoldt@mpie.de"
__status__ = "production"
__date__ = "Feb 4, 2018"
__date__ = "Dec 8, 2023"

BOHR_TO_ANGSTROM = (
scipy.constants.physical_constants["Bohr radius"][0] / scipy.constants.angstrom
)


class StructParser(KeywordTreeParser):
    """
    This class reads one or more structures in sx (SPHInX) format.

    After construction, the parsed structures are available as a list of
    Atoms objects in ``self.configs``; optional per-atom labels are kept
    in ``self.labels`` (None for unlabeled atoms).
    """

    def __init__(self, file):
        """
        Args:
            file (str): filename of the sx structure file to parse
        """
        super().__init__({"structure": self.parse_structure})
        self.configs = []
        self.parse(file)

    def parse_structure(self):
        """Parses structure{} blocks (yields; finishes when block ends)."""
        self.keylevels.append({"cell": self.parse_cell, "species": self.parse_species})
        self.extract_via_regex("structure")
        # --- initialize for next structure
        self.cell = None
        self.positions = []
        self.species = []
        self.indices = []
        self.labels = []
        self.ispecies = -1
        # continue parsing
        yield
        # create Atoms object and append it to configs
        # (coordinates and cell in the sx file are in bohr)
        pse = PeriodicTable()
        atoms = Atoms(
            species=[pse.element(s) for s in self.species],
            indices=self.indices,
            cell=self.cell * BOHR_TO_ANGSTROM,
            positions=np.array(self.positions) * BOHR_TO_ANGSTROM,
            pbc=True,
        )
        self.configs.append(atoms)

    def parse_cell(self):
        """Read the cell (3x3 matrix, in bohr)."""
        txt = self.extract_var("cell")
        self.cell = self.get_vector("cell", txt).reshape(3, 3)

    def parse_species(self):
        """Parses species{} blocks."""
        self.extract_via_regex("species")
        self.keylevels.append({"element": self.get_element, "atom": self.read_atom})
        self.ispecies += 1

    def get_element(self):
        """Read the element name of the current species."""
        txt = self.extract_var("element")
        self.species.append(re.sub('.*"([^"]*)".*', r"\1", txt))

    def read_atom(self):
        """Read atomic coordinates (and an optional label) from an atom block."""
        txt = self.extract_var("atom", "{}")
        self.positions.append(self.get_vector("coords", txt))
        self.indices.append(self.ispecies)
        # store the optional label instead of printing it (the original
        # left a debugging print statement here)
        if "label" in txt:
            self.labels.append(re.sub(r'.*label\s*=\s*"([^"]+)"\s*;.*', r"\1", txt))
        else:
            self.labels.append(None)


def read_atoms(filename="structure.sx"):
    """
    Read one or more structures from a SPHInX structure file.

    Args:
        filename (str): Filename of the sphinx structure file

    Returns:
        pyiron_atomistics.objects.structure.atoms.Atoms instance if the
        file contains exactly one structure, otherwise a list of them.
    """
    configs = StructParser(filename).configs
    return configs[0] if len(configs) == 1 else configs
Loading