126 lines
4.1 KiB
Python
126 lines
4.1 KiB
Python
import sys
|
|
import re
|
|
def remove_comments(text):
    """Strip whole-line ``%`` comments from BibTeX source text.

    A line is treated as a comment when ``%`` is its first character after
    optional leading spaces or tabs. The entire line is removed, together
    with its trailing newline when one is present. A ``%`` that appears
    later in a line is left untouched.
    """
    comment_line = re.compile(r'(?m)^[ \t]*%.*\n?')
    return comment_line.sub('', text)
|
|
|
|
def find_entries(bibtex_content):
    """Split BibTeX text into one raw string per ``@type{`` / ``@type(`` entry.

    Every match of ``@word`` followed by an opening ``{`` or ``(`` starts a
    new entry. An entry runs up to the start of the next match (or to the end
    of the text for the last one) and is returned with surrounding whitespace
    stripped. Text before the first entry is discarded.
    """
    opener = re.compile(r'@(\w+)\s*[{(]', re.IGNORECASE)
    starts = [match.start() for match in opener.finditer(bibtex_content)]
    # Each entry stops where the next one begins; the last runs to the end.
    stops = starts[1:] + [len(bibtex_content)]
    return [bibtex_content[lo:hi].strip() for lo, hi in zip(starts, stops)]
|
|
|
|
def _read_field_value(body, pos):
    """Read one BibTeX field value that begins at ``body[pos]``.

    Returns a ``(value, next_pos)`` tuple, where ``next_pos`` is the index
    just past the value and past its closing delimiter when it has one.
    """
    size = len(body)
    if pos >= size:
        # "name =" with nothing after it: report an empty value, don't crash.
        return '', pos

    if body[pos] == '{':
        depth = 1
        start = pos + 1
        pos = start
        while pos < size and depth > 0:
            if body[pos] == '{':
                depth += 1
            elif body[pos] == '}':
                depth -= 1
            pos += 1
        # Drop the closing brace only if one was actually found; for an
        # unbalanced value keep everything up to the end of the text.
        end = pos - 1 if depth == 0 else pos
        return body[start:end], pos

    if body[pos] == '"':
        start = pos + 1
        pos = start
        while pos < size and body[pos] != '"':
            # A backslash escapes the following character.
            pos += 2 if body[pos] == '\\' else 1
        return body[start:pos], pos + 1

    # Bare value (number or macro name): runs to the next comma or newline.
    start = pos
    while pos < size and body[pos] not in ',\n':
        pos += 1
    return body[start:pos].strip(), pos


def parse_bibtex_entry(entry):
    """Parse one raw BibTeX entry into ``(entry_type, entry_key, fields)``.

    Args:
        entry: Text of a single entry, e.g. ``@article{key, title = {...}}``.

    Returns:
        A tuple ``(entry_type, entry_key, fields)`` where ``entry_type`` is
        lower-cased, ``entry_key`` is the citation key with surrounding
        whitespace removed, and ``fields`` maps each field name (as written)
        to its value with newlines replaced by spaces. Returns ``None`` for
        empty input, text starting with ``%``, text that does not look like
        an entry, an entry without a citation key, and for ``@comment``,
        ``@preamble`` and ``@string`` entries.
    """
    entry = entry.strip()
    if not entry or entry.startswith('%'):
        return None

    header = re.match(r'@(\w+)\s*([{(])\s*', entry)
    if not header:
        return None

    entry_type = header.group(1).lower()
    if entry_type in ('comment', 'preamble', 'string'):
        return None

    # Remove exactly ONE closing delimiter, the one matching the opener.
    # Stripping every trailing brace would eat the closing brace of the last
    # field value whenever it sits directly before the entry's own closer.
    closer = '}' if header.group(2) == '{' else ')'
    rest = entry[header.end():].rstrip()
    if rest.endswith(closer):
        rest = rest[:-1]

    raw_key, _, body = rest.partition(',')
    entry_key = raw_key.strip()
    if not entry_key:
        return None
    body = body.strip()

    fields = {}
    field_regex = re.compile(r'(\w+)\s*=\s*', re.DOTALL)
    pos = 0
    while pos < len(body):
        m = field_regex.match(body, pos)
        if not m:
            # Anything that is not "name = ..." ends the field list.
            break
        value, pos = _read_field_value(body, m.end())
        fields[m.group(1)] = value.replace('\n', ' ').strip()
        # Skip the separators between this field and the next one.
        while pos < len(body) and body[pos] in ', \n\r\t':
            pos += 1
    return entry_type, entry_key, fields
|
|
|
|
def escape_yaml_string(s):
    """Return ``s`` formatted for use as a YAML scalar.

    An empty (falsy) value becomes ``""``. A string containing any character
    from the special set below has its backslashes and double quotes escaped
    and is wrapped in double quotes. Any other string is returned unchanged.
    """
    if not s:
        return '""'
    needs_quoting = set(':\'"{}[],&*#?|-<>=!%@`\\\n')
    if needs_quoting.isdisjoint(s):
        return s
    # Backslashes first, so the ones added for quotes are not doubled again.
    escaped = s.replace('\\', '\\\\').replace('"', '\\"')
    return '"' + escaped + '"'
|
|
|
|
def dict_to_yaml(d, indent=0):
    """Render a flat dict as YAML mapping lines joined by newlines.

    Each key is written at ``indent`` spaces. A list value becomes a block
    sequence whose ``- item`` lines are indented two further spaces; any
    other value is written on the key's own line. Values and list items are
    passed through ``escape_yaml_string``; keys are written as-is.
    """
    key_pad = ' ' * indent
    item_pad = ' ' * (indent + 2)
    rendered = []
    for name, value in d.items():
        if not isinstance(value, list):
            rendered.append(f"{key_pad}{name}: {escape_yaml_string(value)}")
            continue
        rendered.append(f"{key_pad}{name}:")
        rendered.extend(
            f"{item_pad}- {escape_yaml_string(element)}" for element in value
        )
    return '\n'.join(rendered)
|
|
|
|
def bibtex_to_yaml(bibtex_content):
    """Convert BibTeX source text into a YAML document string.

    Comment lines are removed, the text is split into entries, and every
    entry that parses becomes one item of a top-level ``publications`` list.
    Each item starts with ``key`` (the citation key); an ``author`` field is
    split on the word ``and`` into a list of names; all remaining fields
    follow as plain key/value pairs.

    Args:
        bibtex_content: Full text of a ``.bib`` file.

    Returns:
        The YAML text, always beginning with a ``publications:`` line.
    """
    entries = find_entries(remove_comments(bibtex_content))

    chunks = ["publications:\n"]
    for entry in entries:
        parsed = parse_bibtex_entry(entry)
        if not parsed:
            continue
        _entry_type, entry_key, parsed_fields = parsed

        # Work on a copy so the parsed result is not mutated.
        fields = dict(parsed_fields)
        entry_dict = {'key': entry_key}
        author = fields.pop("author", None)
        if author is not None:
            entry_dict['author'] = [
                name.strip() for name in re.split(r'\s+and\s+', author)
            ]
        # NOTE: a BibTeX field literally named "key" replaces the citation
        # key stored above, because update() overwrites existing entries.
        entry_dict.update(fields)

        # Render every line at four spaces, then swap the first line's
        # indent for "  - " (also four columns). All keys of the item then
        # share one column, which YAML requires for a block mapping.
        block = dict_to_yaml(entry_dict, indent=4)
        chunks.append("  - " + block.lstrip() + "\n")
    return ''.join(chunks)
|
|
|
|
def main():
    """Command-line entry point: convert ``<input.bib>`` to a sibling ``.yaml``.

    Expects exactly one argument, the path of the BibTeX file. Prints a usage
    line and exits with status 1 otherwise. The output is written next to the
    input, with the input's extension replaced by ``.yaml``.
    """
    import os  # local import: only needed to compute the output path

    if len(sys.argv) != 2:
        print(f"Usage: {sys.argv[0]} <input.bib>")
        sys.exit(1)

    bibfile = sys.argv[1]
    with open(bibfile, 'r', encoding='utf-8') as f:
        bibtex_content = f.read()

    yaml_content = bibtex_to_yaml(bibtex_content)

    # splitext only considers the last path component, so a dot in a
    # directory name ("./refs", "my.dir/refs") is not mistaken for the
    # start of the file extension.
    yamlfile = os.path.splitext(bibfile)[0] + '.yaml'
    with open(yamlfile, 'w', encoding='utf-8') as f:
        f.write(yaml_content)
    print(f"YAML exported to {yamlfile}")
|
|
|
|
# Run the converter only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|