hugo-template/assets/bib2yaml.py

126 lines
4.1 KiB
Python

import sys
import re
def remove_comments(text):
    """Strip whole-line ``%`` comments from BibTeX source text.

    A line is removed when ``%`` is its first character after optional
    leading spaces or tabs; the line's trailing newline is removed with it.
    A ``%`` that appears later in a line is left untouched.
    """
    comment_line = re.compile(r'(?m)^[ \t]*%.*\n?')
    return comment_line.sub('', text)
def find_entries(bibtex_content):
    """Split BibTeX source into raw entry strings.

    Every occurrence of ``@word`` followed by ``{`` or ``(`` starts a new
    entry. Each entry runs up to the start of the next one (or to the end
    of the text) and is returned with surrounding whitespace stripped.
    Text before the first entry is discarded.
    """
    starts = [
        hit.start()
        for hit in re.finditer(r'@(\w+)\s*[{(]', bibtex_content, re.IGNORECASE)
    ]
    # Each entry stops where the following one begins; the last one
    # stops at the end of the input.
    stops = starts[1:] + [len(bibtex_content)]
    return [bibtex_content[a:b].strip() for a, b in zip(starts, stops)]
def _read_field_value(body, pos, closer):
    """Read one field value from *body* starting at index *pos*.

    Handles brace-delimited values (with nested braces), double-quoted
    values (with backslash escapes) and bare values such as numbers.

    Args:
        body: Text of the entry after the ``@type{key,`` header.
        pos: Index of the first character of the value.
        closer: The entry's closing delimiter (``}`` or ``)``); a bare
            value ends when it is reached.

    Returns:
        ``(value, next_pos)`` where *next_pos* is the index just past the
        value and its closing delimiter, if it has one.
    """
    if pos >= len(body):
        # "name =" at the very end of the entry: treat as an empty value
        # instead of indexing past the end of the string.
        return '', pos

    opener = body[pos]
    if opener == '{':
        depth = 1
        pos += 1
        start = pos
        while pos < len(body) and depth > 0:
            if body[pos] == '{':
                depth += 1
            elif body[pos] == '}':
                depth -= 1
            pos += 1
        # Drop the closing brace only if one was actually found; an
        # unterminated value keeps all of its characters.
        end = pos - 1 if depth == 0 else pos
        return body[start:end], pos

    if opener == '"':
        pos += 1
        start = pos
        while pos < len(body) and body[pos] != '"':
            # A backslash escapes the following character.
            pos += 2 if body[pos] == '\\' else 1
        return body[start:pos], pos + 1

    # Bare value: runs to the next comma, newline or the entry's closer.
    start = pos
    stops = ',\n' + closer
    while pos < len(body) and body[pos] not in stops:
        pos += 1
    return body[start:pos].strip(), pos


def parse_bibtex_entry(entry):
    """Parse one raw BibTeX entry into its type, key and fields.

    Args:
        entry: Text of a single entry, starting at ``@``.

    Returns:
        ``(entry_type, entry_key, fields)`` where *entry_type* is
        lower-cased and *fields* maps field names to their values with
        newlines replaced by spaces. Returns ``None`` for empty input,
        text that does not look like an entry, and ``@comment``,
        ``@preamble`` and ``@string`` blocks.
    """
    entry = entry.strip()
    if not entry or entry.startswith('%'):
        return None

    header = re.match(r'@(\w+)\s*([{(])\s*([^,]+),?', entry)
    if not header:
        return None

    entry_type = header.group(1).lower()
    if entry_type in ('comment', 'preamble', 'string'):
        return None

    closer = '}' if header.group(2) == '{' else ')'
    entry_key = header.group(3).strip()
    if not header.group(0).endswith(','):
        # No comma after the key: the key pattern ran to the end of the
        # entry, so remove the entry's closing delimiter from it.
        entry_key = entry_key.rstrip(closer).rstrip()

    # The entry's closing delimiter is deliberately left in place: blindly
    # stripping trailing braces would also eat the closing brace of the
    # last field (e.g. "title={Foo}}") and truncate its value. The field
    # loop simply stops when it reaches something that is not "name =".
    body = entry[header.end():].strip()

    fields = {}
    field_regex = re.compile(r'(\w+)\s*=\s*')
    pos = 0
    while pos < len(body):
        name_match = field_regex.match(body, pos)
        if not name_match:
            break
        value, pos = _read_field_value(body, name_match.end(), closer)
        fields[name_match.group(1)] = value.replace('\n', ' ').strip()
        # Skip the separator between this field and the next one.
        while pos < len(body) and body[pos] in ', \n\r\t':
            pos += 1
    return entry_type, entry_key, fields
def escape_yaml_string(s):
    """Render *s* as a YAML scalar, double-quoting it when necessary.

    An empty (falsy) value becomes ``""``. A string containing any
    character with special meaning in YAML is wrapped in double quotes
    with backslashes, double quotes and line breaks escaped. Any other
    string is returned unchanged, so a value such as ``2020`` is emitted
    bare.
    """
    if not s:
        return '""'
    needs_quoting = any(c in s for c in ':\'"{}[],&*#?|-<>=!%@`\\\n\r')
    if not needs_quoting:
        return s
    # Backslashes must be escaped first so the escapes added afterwards
    # are not doubled. Line breaks are escaped because a raw line break
    # inside a double-quoted YAML scalar is folded into a space.
    escaped = (
        s.replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )
    return f'"{escaped}"'
def dict_to_yaml(d, indent=0):
    """Render a flat dict as YAML mapping lines.

    Each key is written at *indent* spaces. A list value becomes a block
    sequence whose items are indented two further spaces; any other value
    is written on the key's own line. Values are passed through
    ``escape_yaml_string``. The lines are joined with newlines, without a
    trailing newline.
    """
    key_pad = ' ' * indent
    item_pad = ' ' * (indent + 2)
    out = []
    for name, value in d.items():
        if not isinstance(value, list):
            out.append(f"{key_pad}{name}: {escape_yaml_string(value)}")
            continue
        out.append(f"{key_pad}{name}:")
        out.extend(
            f"{item_pad}- {escape_yaml_string(element)}" for element in value
        )
    return '\n'.join(out)
def bibtex_to_yaml(bibtex_content):
    """Convert BibTeX source text into a YAML ``publications`` list.

    Comments are removed, the text is split into entries, and each entry
    that parses becomes one list item: its citation key under ``key``,
    the ``author`` field (if present) split on ``and`` into a list, then
    the remaining fields in source order. The parsed entry type is not
    written to the output.

    NOTE: a BibTeX field that is itself named ``key`` replaces the
    citation key, because fields are merged over the initial dict.

    Args:
        bibtex_content: Full text of a ``.bib`` file.

    Returns:
        The YAML document as a string, starting with ``publications:``.
    """
    item_marker = " - "
    # Every line after the first must start in the same column as the
    # first key, i.e. right after the list marker; otherwise the keys of
    # one item are not aligned and the result is not valid YAML.
    body_indent = len(item_marker)

    rendered = []
    for raw_entry in find_entries(remove_comments(bibtex_content)):
        parsed = parse_bibtex_entry(raw_entry)
        if not parsed:
            continue
        _entry_type, entry_key, fields = parsed
        entry_dict = {'key': entry_key}
        if "author" in fields:
            entry_dict['author'] = [
                a.strip() for a in re.split(r'\s+and\s+', fields["author"])
            ]
        entry_dict.update(
            (name, value) for name, value in fields.items() if name != "author"
        )
        block = dict_to_yaml(entry_dict, indent=body_indent)
        # Swap the first line's padding for the list marker.
        rendered.append(item_marker + block[body_indent:] + "\n")
    return "publications:\n" + "".join(rendered)
def main():
    """Command-line entry point: convert ``<input.bib>`` to YAML.

    Expects exactly one argument, the path of the BibTeX file. The result
    is written next to it with the extension replaced by ``.yaml``.
    Prints a usage message and exits with status 1 when the argument
    count is wrong.
    """
    import os

    if len(sys.argv) != 2:
        print(f"Usage: {sys.argv[0]} <input.bib>")
        sys.exit(1)
    bibfile = sys.argv[1]
    with open(bibfile, 'r', encoding='utf-8') as f:
        bibtex_content = f.read()
    yaml_content = bibtex_to_yaml(bibtex_content)
    # splitext only looks at the final path component, so dots in
    # directory names ("./data/refs", "my.dir/refs") are left alone.
    yamlfile = os.path.splitext(bibfile)[0] + '.yaml'
    with open(yamlfile, 'w', encoding='utf-8') as f:
        f.write(yaml_content)
    print(f"YAML exported to {yamlfile}")
# Run the converter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()