# hugo-template/assets/bib2yaml.py

import os
import re
import sys
def remove_comments(text):
    """Return *text* with every whole-line ``%`` comment removed.

    A comment line is one whose first non-blank character (after optional
    spaces or tabs) is ``%``. The line is dropped together with its trailing
    newline; a ``%`` appearing later on a line is left untouched.
    """
    comment_line = re.compile(r'(?m)^[ \t]*%.*\n?')
    return comment_line.sub('', text)

def find_entries(bibtex_content):
    """Split BibTeX text into raw entry strings.

    Every ``@word`` followed by ``{`` or ``(`` marks the start of an entry.
    Each entry runs up to the start of the next one (or the end of the text)
    and is returned with surrounding whitespace stripped. Text before the
    first marker is discarded.
    """
    starts = [
        hit.start()
        for hit in re.finditer(r'@(\w+)\s*[{(]', bibtex_content, re.IGNORECASE)
    ]
    # Pair each start with the following start; the last entry ends at EOF.
    stops = starts[1:] + [len(bibtex_content)]
    return [bibtex_content[begin:finish].strip() for begin, finish in zip(starts, stops)]

def _read_field_value(body, pos):
    """Read one field value from *body* starting at index *pos*.

    Three value forms are recognised:

    * ``{...}``  - brace-delimited, with nested braces balanced;
    * ``"..."``  - quote-delimited, a backslash skips the next character;
    * bare text  - runs to the next comma or newline (numbers, macro names).

    Returns a ``(value, next_pos)`` tuple where *next_pos* is the index just
    past the consumed value. If *pos* is already at the end of *body* the
    value is the empty string.
    """
    end = len(body)
    # Guard against a field whose '=' is the very last thing in the body.
    opener = body[pos] if pos < end else ''

    if opener == '{':
        depth = 1
        start = pos + 1
        pos = start
        while pos < end and depth > 0:
            if body[pos] == '{':
                depth += 1
            elif body[pos] == '}':
                depth -= 1
            pos += 1
        # With a matching close brace, pos sits one past it; an unterminated
        # value runs to the end and must not lose its final character.
        stop = pos - 1 if depth == 0 else pos
        return body[start:stop], pos

    if opener == '"':
        start = pos + 1
        pos = start
        while pos < end and body[pos] != '"':
            pos += 2 if body[pos] == '\\' else 1
        return body[start:pos], pos + 1

    start = pos
    while pos < end and body[pos] not in ',\n':
        pos += 1
    return body[start:pos].strip(), pos


def parse_bibtex_entry(entry):
    """Parse a single raw BibTeX entry into its type, key and fields.

    Args:
        entry: Text of one entry, starting at its ``@type{`` / ``@type(``
            marker.

    Returns:
        ``(entry_type, entry_key, fields)`` where *entry_type* is lowercased,
        *entry_key* is the citation key with surrounding whitespace removed,
        and *fields* maps each field name to its value with newlines replaced
        by spaces. Returns ``None`` for empty input, ``%`` comment lines,
        text that does not look like an entry, and ``@comment`` /
        ``@preamble`` / ``@string`` entries.
    """
    entry = entry.strip()
    if not entry or entry.startswith('%'):
        return None
    m = re.match(r'@(\w+)\s*[{(]\s*([^,]+),?', entry, re.DOTALL)
    if not m:
        return None
    entry_type = m.group(1).lower()
    if entry_type in ('comment', 'preamble', 'string'):
        return None
    entry_key = m.group(2).strip()

    body = entry[m.end():].strip()
    # Drop only the entry's own closing delimiter. Stripping every trailing
    # '}' would also remove the closing brace of the last field's value.
    if body.endswith(('}', ')')):
        body = body[:-1].rstrip()

    fields = {}
    # Field names may contain hyphens (e.g. "date-added").
    field_regex = re.compile(r'([\w-]+)\s*=\s*')
    pos = 0
    while pos < len(body):
        field_match = field_regex.match(body, pos)
        if not field_match:
            # Anything that is not "name =" ends the field list.
            break
        value, pos = _read_field_value(body, field_match.end())
        fields[field_match.group(1)] = value.replace('\n', ' ').strip()
        # Skip the separator and whitespace before the next field.
        while pos < len(body) and body[pos] in ', \n\r\t':
            pos += 1
    return entry_type, entry_key, fields

def escape_yaml_string(s):
    """Render *s* as a YAML scalar, double-quoting it when necessary.

    An empty (or otherwise falsy) value becomes ``""``. A string containing
    any character that is special in YAML, a backslash, or a line break is
    wrapped in double quotes with backslashes, double quotes, newlines and
    carriage returns escaped. Any other string is returned unchanged.
    """
    if not s:
        return '""'
    if any(c in s for c in ':\'"{}[],&*#?|-<>=!%@`\\\n\r'):
        # Backslashes first, so the escapes added below are not doubled.
        s = s.replace('\\', '\\\\').replace('"', '\\"')
        # A raw line break inside a double-quoted scalar would be folded (or
        # break the surrounding block indentation), so emit escapes instead.
        s = s.replace('\n', '\\n').replace('\r', '\\r')
        return f'"{s}"'
    return s

def dict_to_yaml(d, indent=0):
    """Serialise a flat dict to YAML block-mapping lines.

    Each key is written at *indent* spaces. A list value becomes a block
    sequence whose items sit two spaces deeper than the key; any other value
    is written inline after the key. Values are passed through
    ``escape_yaml_string``. The lines are joined with newlines, without a
    trailing newline.
    """
    key_pad = ' ' * indent
    item_pad = ' ' * (indent + 2)
    out = []
    for name, value in d.items():
        if not isinstance(value, list):
            out.append(f"{key_pad}{name}: {escape_yaml_string(value)}")
            continue
        out.append(f"{key_pad}{name}:")
        out.extend(f"{item_pad}- {escape_yaml_string(element)}" for element in value)
    return '\n'.join(out)

def bibtex_to_yaml(bibtex_content):
    """Convert BibTeX source text into a YAML document.

    Comment lines are removed, the text is split into entries, and every
    entry that parses is emitted as one item of a top-level ``publications``
    list. Each item starts with ``key`` (the citation key); an ``author``
    field, if present, is split on the word ``and`` into a list; the
    remaining fields follow as parsed.

    Returns:
        The YAML text, always starting with the ``publications:`` line.
    """
    records = []
    for raw_entry in find_entries(remove_comments(bibtex_content)):
        parsed = parse_bibtex_entry(raw_entry)
        if not parsed:
            continue
        _entry_type, entry_key, fields = parsed
        record = {'key': entry_key}
        if "author" in fields:
            record['author'] = [
                name.strip() for name in re.split(r'\s+and\s+', fields.pop("author"))
            ]
        record.update(fields)
        records.append(record)

    chunks = ["publications:\n"]
    for record in records:
        # dict_to_yaml already indents every line by 4 columns, which lines up
        # with the text after the "  - " marker. Only the first line's indent
        # is stripped; adding more indent to the following lines would
        # misalign the mapping keys and produce invalid YAML.
        chunks.append("  - " + dict_to_yaml(record, indent=4).lstrip() + "\n")
    return ''.join(chunks)

def main():
    """Command-line entry point: convert ``<input.bib>`` to a sibling ``.yaml``.

    Expects exactly one argument, the path of the BibTeX file. The file is
    read as UTF-8, converted, and written next to the input with its
    extension replaced by ``.yaml``. Prints a usage message and exits with
    status 1 when the argument count is wrong.
    """
    if len(sys.argv) != 2:
        print(f"Usage: {sys.argv[0]} <input.bib>")
        sys.exit(1)
    bibfile = sys.argv[1]
    with open(bibfile, 'r', encoding='utf-8') as f:
        bibtex_content = f.read()
    yaml_content = bibtex_to_yaml(bibtex_content)
    # splitext only looks at the final path component, so a dot in a
    # directory name (e.g. "../data/refs") is never mistaken for an extension.
    yamlfile = os.path.splitext(bibfile)[0] + '.yaml'
    with open(yamlfile, 'w', encoding='utf-8') as f:
        f.write(yaml_content)
    print(f"YAML exported to {yamlfile}")

# Run the converter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()