# Source code for pasted._io

"""
pasted._io
==========
XYZ format serialisation helpers.
"""

from __future__ import annotations

import math
import re
from collections import Counter
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from ._placement import Vec3


def _fmt(v: float) -> str:
    """Render a metric value as text.

    NaN is rendered as the literal string ``'nan'``; every other value is
    rendered in fixed-point notation with four decimal places.
    """
    if math.isnan(v):
        return "nan"
    return format(v, ".4f")



# [docs]
def format_xyz(
    atoms: list[str],
    positions: list[Vec3],
    charge: int,
    mult: int,
    metrics: dict[str, float],
    prefix: str = "",
) -> str:
    """Serialise a structure to the extended XYZ format.

    The second line (comment line) encodes *prefix*, charge, multiplicity,
    composition, and all metric values.

    Parameters
    ----------
    atoms:
        Element symbols.
    positions:
        Cartesian coordinates (Å), one per atom.
    charge:
        Total system charge.
    mult:
        Spin multiplicity 2S+1.
    metrics:
        Dict of computed disorder metrics.
    prefix:
        Prepended to the comment line (e.g. ``"sample=1 mode=gas"``).

    Returns
    -------
    A multi-line string (no trailing newline).
    """
    comp = ",".join(f"{s}:{c}" for s, c in sorted(Counter(atoms).items()))
    metric_str = "  ".join(f"{k}={_fmt(v)}" for k, v in metrics.items())
    comment = (f"{prefix} charge={charge:+d} mult={mult} comp=[{comp}]  {metric_str}").strip()
    lines = [str(len(atoms)), comment]
    for atom, (x, y, z) in zip(atoms, positions, strict=False):
        lines.append(f"{atom:<4s}  {x:12.6f}  {y:12.6f}  {z:12.6f}")
    return "\n".join(lines)




# [docs]
def parse_xyz(text: str) -> list[tuple[list[str], list[Vec3], int, int, dict[str, float]]]:
    """Parse a (possibly multi-frame) XYZ string — standard or extended format.

    Supports both:

    * **Standard XYZ** — atom count line, comment line, then coordinate lines.
      ``charge`` defaults to 0, ``mult`` to 1, ``metrics`` is empty.
    * **Extended XYZ** (as written by PASTED) — the comment line may contain
      ``charge=+0``, ``mult=1``, and ``KEY=VALUE`` metric tokens.

    Every numeric ``KEY=VALUE`` token on the comment line other than
    ``charge`` and ``mult`` is collected into ``metrics``; this includes
    numeric tokens that came from a prefix (e.g. ``sample=1``).  Values may be
    decimal or scientific-notation literals, or ``nan`` / ``inf`` /
    ``infinity`` (case-insensitive, optionally signed), so NaN metrics
    serialised as ``key=nan`` are read back instead of being dropped.

    Parameters
    ----------
    text:
        Full contents of one or more XYZ frames (concatenated).  Blank lines
        between frames are skipped.

    Returns
    -------
    list of ``(atoms, positions, charge, mult, metrics)`` tuples, one per frame.

    Raises
    ------
    ValueError
        When the atom-count line is not a non-negative integer, when the input
        ends before a frame is complete, or when a coordinate line cannot be
        parsed.
    """
    # Patterns are loop-invariant, so build them once before the frame loop.
    charge_re = re.compile(r"\bcharge=([+-]?\d+)")
    mult_re = re.compile(r"\bmult=(\d+)")
    # The trailing \b on the nan/inf branch keeps words that merely start with
    # those letters (e.g. "mode=inflate") from being read as a float.
    metric_re = re.compile(
        r"\b([A-Za-z_][A-Za-z0-9_]*)="
        r"([+-]?(?:(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?|(?i:nan|inf(?:inity)?)\b))"
    )

    frames = []
    lines = text.splitlines()
    i = 0
    while i < len(lines):
        # Skip blank lines between frames
        if not lines[i].strip():
            i += 1
            continue

        # --- atom count line ---
        count_line = lines[i]
        try:
            n_atoms = int(count_line.strip())
        except ValueError as exc:
            raise ValueError(
                f"Expected atom count on line {i + 1}, got {count_line!r}"
            ) from exc
        # A negative count would yield an empty frame and leave the parser
        # out of step with the rest of the input; reject it explicitly.
        if n_atoms < 0:
            raise ValueError(
                f"Atom count on line {i + 1} must be non-negative, got {n_atoms}"
            )
        i += 1

        if i >= len(lines):
            raise ValueError("Unexpected end of file after atom count line.")

        # --- comment line (extended XYZ fields) ---
        comment = lines[i]
        i += 1

        m_charge = charge_re.search(comment)
        charge = int(m_charge.group(1)) if m_charge else 0
        m_mult = mult_re.search(comment)
        mult = int(m_mult.group(1)) if m_mult else 1

        # Every value matched by metric_re is a valid float() literal, so no
        # conversion guard is needed.  charge/mult were captured above.
        metrics: dict[str, float] = {
            key: float(val_str)
            for key, val_str in metric_re.findall(comment)
            if key not in ("charge", "mult")
        }

        # --- coordinate lines ---
        atoms: list[str] = []
        positions: list[tuple[float, float, float]] = []
        for _ in range(n_atoms):
            if i >= len(lines):
                raise ValueError(
                    f"Unexpected end of file: expected {n_atoms} coordinate lines."
                )
            line = lines[i]
            i += 1
            parts = line.split()
            if len(parts) < 4:
                raise ValueError(f"Malformed coordinate line: {line!r}")
            try:
                xyz = (float(parts[1]), float(parts[2]), float(parts[3]))
            except ValueError as exc:
                raise ValueError(f"Non-numeric coordinate in: {line!r}") from exc
            atoms.append(parts[0])
            positions.append(xyz)

        frames.append((atoms, positions, charge, mult, metrics))

    return frames