Skip to content

Instantly share code, notes, and snippets.

@veygax
Last active November 20, 2025 15:05
Show Gist options
  • Select an option

  • Save veygax/707d044b8f84004dd77727aa14635ccf to your computer and use it in GitHub Desktop.

Select an option

Save veygax/707d044b8f84004dd77727aa14635ccf to your computer and use it in GitHub Desktop.
A simple extractor that rebuilds .proto definitions from protobuf-generated Java files.
#!/usr/bin/env python3
# Java Protobuf Extractor
# Copyright (c) 2025 [email protected]
# SPDX-License-Identifier: MIT
import re
import sys
from pathlib import Path
from collections import defaultdict
def extract_enum_values(java_content, enum_name):
    """Return the ``{entry_name: number}`` mapping of one generated Java enum.

    The enum is located by its ``public enum <name> ... implements`` header.
    Its entry list is taken to be the text between the opening brace and the
    first ``public static final int`` declaration that follows.  Entries
    written as ``NAME(123)`` are used directly; entries written as
    ``NAME(SOME_CONSTANT)`` are resolved through the
    ``public static final int SOME_CONSTANT = 123;`` declarations of the file.

    Entries whose argument is not a bare word or number (for example
    ``UNRECOGNIZED(-1)``) are not matched by the entry pattern and are
    therefore not part of the result.

    Args:
        java_content: Full text of the Java source file.
        enum_name: Simple name of the enum to extract.

    Returns:
        A dict mapping entry names to integer values.  It is empty when the
        enum cannot be located; entries whose constant cannot be resolved
        are left out.
    """
    # \b keeps a short name (e.g. "Color") from matching a longer enum that
    # merely starts with it (e.g. "ColorMode"); re.escape keeps the name literal.
    enum_pattern = (
        rf'public enum {re.escape(enum_name)}\b.*?implements.*?\{{'
        r'(.*?)\n\s*public static final int'
    )
    enum_match = re.search(enum_pattern, java_content, re.DOTALL)
    if not enum_match:
        return {}

    # entry name -> literal number or name of an int constant
    entries = dict(re.findall(r'(\w+)\((\w+)\)', enum_match.group(1)))

    constant_re = re.compile(r'public static final int (\w+) = (\d+);')
    # file-wide table (last declaration wins) is only the fallback ...
    constants = {
        name: int(number)
        for name, number in constant_re.findall(java_content)
    }
    # ... the first declaration found after this enum's header takes priority,
    # so equally named constants of other enums do not leak into this one
    nearest = {}
    for name, number in constant_re.findall(java_content, enum_match.start()):
        nearest.setdefault(name, int(number))
    constants.update(nearest)

    # resolve constant references to actual integer values
    result = {}
    for name, value_ref in entries.items():
        if value_ref.isdigit():
            result[name] = int(value_ref)
        elif value_ref in constants:
            result[name] = constants[value_ref]
    return result
def extract_message_fields(java_content, class_name):
    """Return the ``(field_number, field_name)`` pairs of one message class.

    The class is located by its
    ``public static final class <name> extends GeneratedMessageLite`` header.
    Every ``public static final int <X>_FIELD_NUMBER = <n>;`` declaration
    between the opening brace and the class's private constructor yields one
    pair, with ``<X>`` lower-cased as the field name.

    Args:
        java_content: Full text of the Java source file.
        class_name: Simple name of the message class.

    Returns:
        A list of ``(number, name)`` tuples sorted by field number; empty when
        the class cannot be located or declares no field numbers.
    """
    name = re.escape(class_name)
    body_pattern = (
        rf'public static final class {name} extends GeneratedMessageLite'
        rf'.*?\{{(.*?)\n\s*private {name}'
    )
    # End the body at the private constructor ``private Name(``.  A bare
    # ``private Name`` also matches a field of the message's own type
    # (``private Name child_;``) and would cut the body short, so that looser
    # form is only kept as a fallback for sources without such a constructor.
    class_match = (
        re.search(body_pattern + r'\s*\(', java_content, re.DOTALL)
        or re.search(body_pattern, java_content, re.DOTALL)
    )
    if not class_match:
        return []

    field_pattern = r'public static final int (\w+)_FIELD_NUMBER = (\d+);'
    fields = [
        (int(number), constant.lower())
        for constant, number in re.findall(field_pattern, class_match.group(1))
    ]
    return sorted(fields, key=lambda field: field[0])
def infer_field_type(java_content, class_name, field_name):
    """Infer the protobuf type of one message field from the Java source.

    The text of ``class_name`` (up to the next ``public static final class``)
    is searched for a getter of the field; its return type is preferred
    because enum fields are stored as ``int`` but returned as the enum type.
    If no getter is found, the ``private <type> <field>_`` declaration is
    used instead.  The Java type is then translated to a proto type.

    Args:
        java_content: Full text of the Java source file.
        class_name: Simple name of the message class owning the field.
        field_name: snake_case field name (as produced from ``*_FIELD_NUMBER``).

    Returns:
        A proto type string: a scalar such as ``'int32'``, an enum/message
        name, or either of those prefixed with ``'repeated '``.  ``'bytes'``
        is returned when no type can be determined.
    """
    # convert snake_case to CamelCase for matching java method names
    camel_name = ''.join(word.capitalize() for word in field_name.split('_'))
    camel = re.escape(camel_name)
    flat = re.escape(field_name.replace('_', '').lower())

    # isolate the class section to avoid matching methods from other classes;
    # \b stops "Shape" from selecting the section of "ShapeList"
    class_pattern = (
        rf'public static final class {re.escape(class_name)}\b'
        r'[^\{]*\{(.*?)(?=public static final class|\Z)'
    )
    class_match = re.search(class_pattern, java_content, re.DOTALL)
    class_section = class_match.group(1) if class_match else java_content

    # try multiple getter patterns, prioritizing getters over field declarations
    # as they show actual return types; methods ending in "Bytes" are internal
    # conversion methods and are not matched
    getter_patterns = [
        # repeated fields often have "List" suffix like getControllersList()
        rf'public (List<[\w<>\.]+>|Internal\.ProtobufList<[\w<>\.]+>) get{camel}List\s*\(',
        rf'public ([\w<>\.]+) get{camel}(?!Bytes)\s*\(',
        rf'public ([\w<>\.]+) get{flat}(?!Bytes)\s*\(',
        rf'@Override[^\n]*\n[^\n]*public ([\w<>\.]+) get{camel}(?!Bytes)\s*\(',
        # interface-style declarations; "return getX();" is a call, not a
        # declaration, so the keyword must not be captured as a type
        rf'\b(?!return\b)([\w<>\.]+) get{camel}(?!Bytes)\s*\(\s*\)\s*;',
    ]
    java_type = None
    for pattern in getter_patterns:
        match = re.search(pattern, class_section, re.IGNORECASE)
        if match:
            java_type = match.group(1).strip()
            break

    # fallback to private field declarations (note: enum types often have int
    # fields but return enums from getters)
    if not java_type:
        lower_camel = re.escape(camel_name[:1].lower() + camel_name[1:])
        field_patterns = [
            rf'private ([\w<>\.]+) {re.escape(field_name)}_',
            rf'private ([\w<>\.]+) {re.escape(field_name.lower())}_',
            # generated Java fields are lowerCamelCase, e.g. vertexCount_
            rf'private ([\w<>\.]+) {lower_camel}_',
        ]
        for pattern in field_patterns:
            match = re.search(pattern, class_section)
            if match:
                java_type = match.group(1).strip()
                break
    if not java_type:
        return 'bytes'

    type_map = {
        'int': 'int32',
        'long': 'int64',
        'boolean': 'bool',
        'String': 'string',
        'ByteString': 'bytes',
        'float': 'float',
        'double': 'double',
    }
    # list element types are boxed in Java (List<Integer>, List<Long>, ...)
    boxed_map = {
        'Integer': 'int32',
        'Long': 'int64',
        'Boolean': 'bool',
        'Float': 'float',
        'Double': 'double',
    }

    # detect repeated fields from list types first before other processing
    # ('Internal.ProtobufList<' also contains 'List<')
    if 'List<' in java_type:
        inner_match = re.search(r'List<([\w\.]+)>', java_type)
        if inner_match:
            # handle qualified names like java.lang.String
            inner_type = inner_match.group(1).split('.')[-1]
            # anything that is not a scalar is an enum or message name
            element = type_map.get(inner_type) or boxed_map.get(inner_type) or inner_type
            return f'repeated {element}'

    # check exact matches for primitive types
    if java_type in type_map:
        return type_map[java_type]

    # check if the type is an enum defined in the java file
    enum_pattern = rf'public enum {re.escape(java_type)}\b.*?implements.*?Internal\.EnumLite'
    if re.search(enum_pattern, java_content, re.DOTALL):
        return java_type

    # qualified scalars (java.lang.String, com.google.protobuf.ByteString):
    # compare the simple name exactly -- a substring test would turn message
    # types such as "Point" into int32
    simple_name = java_type.split('.')[-1]
    if simple_name in type_map:
        return type_map[simple_name]

    # assume capitalized types without dots are custom message types
    if java_type[0].isupper() and '.' not in java_type:
        return java_type
    return 'bytes'
def generate_proto_file(java_file):
    """Build the text of a .proto file from one protobuf-generated Java file.

    The Java source is read once and scanned with regular expressions: the
    syntax version and package come from the file as a whole, enums from
    ``extract_enum_values`` and messages from ``extract_message_fields`` /
    ``infer_field_type``.  Progress messages are printed to stdout.

    Args:
        java_file: Path of the Java source file to read.

    Returns:
        The generated .proto content as a single newline-joined string.
    """
    print(f"reading {java_file}...")
    # decode explicitly as UTF-8 instead of the platform's locale encoding so
    # the result does not depend on the machine; undecodable bytes are dropped
    content = Path(java_file).read_text(encoding='utf-8', errors='ignore')

    # detect proto syntax version by checking for 'has' methods (proto2
    # generates them, proto3 doesn't for primitives)
    has_methods = re.search(r'public boolean has\w+\(\)', content)
    syntax = "proto2" if has_methods is not None else "proto3"
    proto_lines = [f'syntax = "{syntax}";']

    # extract package name from java package declaration
    package_match = re.search(r'package\s+([\w\.]+)\s*;', content)
    if package_match:
        proto_lines.append(f'package {package_match.group(1)};')
    proto_lines.append('')
    proto_lines.append(f'// generated from {java_file}')
    proto_lines.append('')

    print("extracting enums...")
    enum_pattern = r'public enum (\w+) implements Internal\.EnumLite'
    # an enum name can occur several times in one file; extract_enum_values
    # resolves a name to a single definition, so emit each name only once
    # (first-seen order is kept)
    enum_names = list(dict.fromkeys(re.findall(enum_pattern, content)))
    print(f"found {len(enum_names)} enums")
    for enum_name in enum_names:
        enum_values = extract_enum_values(content, enum_name)
        if not enum_values:
            continue
        proto_lines.append(f'enum {enum_name} {{')
        for name, value in sorted(enum_values.items(), key=lambda item: item[1]):
            proto_lines.append(f' {name} = {value};')
        proto_lines.append('}')
        proto_lines.append('')

    # find all message classes including request, response, and custom types
    print("extracting message definitions...")
    # every class extending GeneratedMessageLite, except the builder classes
    class_pattern = r'public static final class (\w+) extends GeneratedMessageLite'
    all_messages = sorted(
        {name for name in re.findall(class_pattern, content) if name != 'Builder'}
    )
    print(f"found {len(all_messages)} message types")
    for msg_class in all_messages:
        fields = extract_message_fields(content, msg_class)
        if not fields:
            continue
        proto_lines.append(f'message {msg_class} {{')
        for num, name in fields:
            field_type = infer_field_type(content, msg_class, name)
            # use 'optional' for non-repeated fields
            if field_type.startswith('repeated '):
                proto_lines.append(f' {field_type} {name} = {num};')
            else:
                proto_lines.append(f' optional {field_type} {name} = {num};')
        proto_lines.append('}')
        proto_lines.append('')
    return '\n'.join(proto_lines)
if __name__ == '__main__':
    # command-line entry point: <script> <java_file> writes <stem>.proto into
    # the current working directory
    if len(sys.argv) < 2:
        # without an argument there is nothing to convert; say so and fail
        # instead of exiting silently
        print(f"usage: {sys.argv[0]} <java_file>")
        sys.exit(1)
    java_file = sys.argv[1]
    if not Path(java_file).exists():
        print(f"error: {java_file} not found")
        sys.exit(1)
    proto_content = generate_proto_file(java_file)
    # use input filename without path as output filename
    output_file = Path(java_file).stem + '.proto'
    # write UTF-8 explicitly so the output does not depend on the locale
    Path(output_file).write_text(proto_content, encoding='utf-8')
    print(f"\ngenerated {output_file}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment