Last active
November 20, 2025 15:05
-
-
Save veygax/707d044b8f84004dd77727aa14635ccf to your computer and use it in GitHub Desktop.
simple protobuf extractor for Java files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # Java Protobuf Extractor | |
| # Copyright (c) 2025 [email protected] | |
| # SPDX-License-Identifier: MIT | |
| import re | |
| import sys | |
| from pathlib import Path | |
| from collections import defaultdict | |
def extract_enum_values(java_content, enum_name):
    """Return ``{entry_name: number}`` for one Java enum found in *java_content*.

    The enum is located by its ``public enum <name> ... implements ... {``
    header; the entries are read from the text between the opening brace and
    the first ``public static final int`` line that follows it.

    Each entry has the form ``NAME(arg)`` where ``arg`` is either an integer
    literal (optionally negative) or the name of a
    ``public static final int`` constant.  Constant names are resolved
    against the declarations that follow this enum's entries first, so two
    enums that declare a constant with the same name do not overwrite each
    other; constants declared elsewhere in the file are used as a fallback.

    Args:
        java_content: Full text of the Java source file.
        enum_name: Simple name of the enum to extract.

    Returns:
        A dict mapping entry names to ints.  Entries whose argument cannot be
        resolved are omitted.  An empty dict is returned when the enum is not
        found.
    """
    # \b keeps "Status" from matching an earlier "StatusCode" declaration.
    enum_pattern = (
        rf'public enum {re.escape(enum_name)}\b.*?implements.*?\{{(.*?)'
        r'\n\s*public static final int'
    )
    enum_match = re.search(enum_pattern, java_content, re.DOTALL)
    if not enum_match:
        return {}

    const_pattern = re.compile(r'public static final int (\w+) = (-?\d+);')

    # File-wide constants act as the fallback (last declaration wins).
    constants = {}
    for const in const_pattern.finditer(java_content):
        constants[const.group(1)] = int(const.group(2))

    # Declarations following this enum's entries take priority; the first
    # occurrence after the entries is the one closest to this enum.
    nearest = {}
    for const in const_pattern.finditer(java_content, enum_match.end(1)):
        nearest.setdefault(const.group(1), int(const.group(2)))
    constants.update(nearest)

    result = {}
    for entry in re.finditer(r'(\w+)\((-?\w+)\)', enum_match.group(1)):
        name, value_ref = entry.group(1), entry.group(2)
        # UNRECOGNIZED(-1) is the sentinel protoc adds to proto3 enums; it is
        # not a value of the original .proto definition.
        if name == 'UNRECOGNIZED':
            continue
        if re.fullmatch(r'-?\d+', value_ref):
            result[name] = int(value_ref)
        elif value_ref in constants:
            result[name] = constants[value_ref]
    return result
def extract_message_fields(java_content, class_name):
    """Return the ``(field_number, field_name)`` pairs declared by a message class.

    The class is located by its
    ``public static final class <name> extends GeneratedMessageLite`` header.
    Its ``<NAME>_FIELD_NUMBER`` constants are read from the text between the
    opening brace and the class's private constructor (``private <name>(``).

    The constructor, rather than any ``private <name>...`` text, is used as
    the end marker so that a member whose type starts with the class name
    (``private NodeList child_;``) or is the class itself does not cut the
    scan short and drop the constants declared after it.

    Args:
        java_content: Full text of the Java source file.
        class_name: Simple name of the message class.

    Returns:
        A list of ``(number, lower-cased constant prefix)`` tuples sorted by
        field number, or an empty list when the class (or its private
        constructor) is not found.
    """
    escaped_name = re.escape(class_name)
    class_pattern = (
        rf'public static final class {escaped_name} extends GeneratedMessageLite.*?\{{(.*?)'
        rf'\n\s*private {escaped_name}\s*\('
    )
    class_match = re.search(class_pattern, java_content, re.DOTALL)
    if not class_match:
        return []

    field_pattern = r'public static final int (\w+)_FIELD_NUMBER = (\d+);'
    fields = [
        (int(found.group(2)), found.group(1).lower())
        for found in re.finditer(field_pattern, class_match.group(1))
    ]
    return sorted(fields, key=lambda item: item[0])
def infer_field_type(java_content, class_name, field_name):
    """Infer the protobuf type of a message field from its Java accessors.

    The Java type is taken from the first of these that is found inside the
    section of *java_content* belonging to *class_name*:

    1. a ``get<Field>List(`` getter returning a ``List<...>`` /
       ``Internal.ProtobufList<...>`` (repeated field),
    2. a ``public <type> get<Field>(`` getter,
    3. an interface-style ``<type> get<Field>();`` declaration,
    4. a ``private <type> <field>_`` member (snake_case or lowerCamel name).

    Getter lookups are case-insensitive.  When the class section cannot be
    located, the whole file is searched instead.

    Args:
        java_content: Full text of the Java source file.
        class_name: Simple name of the message class owning the field.
        field_name: Field name in snake_case (e.g. ``user_id``).

    Returns:
        A proto type string: a scalar (``int32``, ``string``, ...), an enum or
        message type name, any of these prefixed with ``repeated ``, or
        ``'bytes'`` when nothing usable is found.
    """
    upper_camel = ''.join(word.capitalize() for word in field_name.split('_'))
    getter = re.escape(upper_camel)

    # Isolate this class's text so getters of other classes are not matched.
    # \b stops "User" from selecting an earlier "UserInfo" class.
    class_pattern = (
        rf'public static final class {re.escape(class_name)}\b[^\{{]*\{{(.*?)'
        r'(?=public static final class|\Z)'
    )
    class_match = re.search(class_pattern, java_content, re.DOTALL)
    class_section = class_match.group(1) if class_match else java_content

    # Getters are preferred over member declarations because they expose the
    # real return type (enum fields are often stored as int members).
    getter_patterns = [
        rf'public (List<[\w<>\.]+>|Internal\.ProtobufList<[\w<>\.]+>) get{getter}List\s*\(',
        rf'public ([\w<>\.]+) get{getter}(?!Bytes)\s*\(',
        rf'([\w<>\.]+) get{getter}(?!Bytes)\s*\(\s*\)\s*;',
    ]
    java_type = None
    for pattern in getter_patterns:
        match = re.search(pattern, class_section, re.IGNORECASE)
        if match:
            java_type = match.group(1).strip()
            break

    if not java_type:
        # Member names: as given, lower-cased, and lowerCamel (user_id -> userId).
        lower_camel = upper_camel[:1].lower() + upper_camel[1:]
        member_names = dict.fromkeys([field_name, field_name.lower(), lower_camel])
        for member in member_names:
            match = re.search(
                rf'private ([\w<>\.]+) {re.escape(member)}_\b', class_section
            )
            if match:
                java_type = match.group(1).strip()
                break

    if not java_type:
        return 'bytes'

    type_map = {
        'int': 'int32',
        'long': 'int64',
        'boolean': 'bool',
        'String': 'string',
        'ByteString': 'bytes',
        'float': 'float',
        'double': 'double',
        # boxed types appear as list element types, e.g. List<Integer>
        'Integer': 'int32',
        'Long': 'int64',
        'Boolean': 'bool',
        'Float': 'float',
        'Double': 'double',
    }

    # Repeated fields: map the element type, keeping enum/message names as-is.
    if 'List<' in java_type:
        inner_match = re.search(r'List<([\w\.]+)>', java_type)
        if inner_match:
            # drop any package qualifier, e.g. java.lang.String -> String
            inner_type = inner_match.group(1).split('.')[-1]
            return f'repeated {type_map.get(inner_type, inner_type)}'

    if java_type in type_map:
        return type_map[java_type]

    # An enum declared in this file is referenced by its Java type name.
    enum_pattern = (
        rf'public enum {re.escape(java_type)}\b.*?implements.*?Internal\.EnumLite'
    )
    if re.search(enum_pattern, java_content, re.DOTALL):
        return java_type

    # Qualified scalar names (java.lang.String) are matched on their last
    # segment only; a substring test would turn "Point" into int32.
    simple_name = java_type.split('.')[-1]
    if simple_name in type_map:
        return type_map[simple_name]

    # Capitalized, unqualified names are assumed to be message types.
    if java_type[0].isupper() and '.' not in java_type:
        return java_type
    return 'bytes'
def generate_proto_file(java_file):
    """Build the text of a .proto file from one Java source file.

    Reads *java_file*, picks ``proto2`` when any ``public boolean has...()``
    method is present (``proto3`` otherwise), copies the Java ``package``
    declaration, then emits one ``enum`` block per ``Internal.EnumLite`` enum
    and one ``message`` block per ``GeneratedMessageLite`` class (classes
    named ``Builder`` are skipped).  Enums and messages for which nothing
    could be extracted are left out.  Progress is printed to stdout.

    Args:
        java_file: Path of the Java source file to read.

    Returns:
        The generated .proto content as a single newline-joined string.
    """
    print(f"reading {java_file}...")
    source = Path(java_file).read_text(errors='ignore')

    # 'has' methods are used as the marker for proto2-generated code
    uses_has_methods = re.search(r'public boolean has\w+\(\)', source) is not None
    syntax = "proto2" if uses_has_methods else "proto3"
    out = [f'syntax = "{syntax}";']

    # reuse the java package declaration as the proto package
    package_decl = re.search(r'package\s+([\w\.]+)\s*;', source)
    if package_decl is not None:
        out.append(f'package {package_decl.group(1)};')
    out.extend(['', f'// generated from {java_file}', ''])

    print("extracting enums...")
    enum_names = re.findall(r'public enum (\w+) implements Internal\.EnumLite', source)
    print(f"found {len(enum_names)} enums")
    for enum_name in enum_names:
        members = extract_enum_values(source, enum_name)
        if not members:
            continue
        out.append(f'enum {enum_name} {{')
        # entries are listed in ascending numeric order
        for label, number in sorted(members.items(), key=lambda pair: pair[1]):
            out.append(f' {label} = {number};')
        out.extend(['}', ''])

    print("extracting message definitions...")
    # every GeneratedMessageLite subclass except the nested Builder classes
    message_names = sorted({
        found.group(1)
        for found in re.finditer(
            r'public static final class (\w+) extends GeneratedMessageLite', source
        )
        if found.group(1) != 'Builder'
    })
    print(f"found {len(message_names)} message types")
    for message_name in message_names:
        message_fields = extract_message_fields(source, message_name)
        if not message_fields:
            continue
        out.append(f'message {message_name} {{')
        for number, field in message_fields:
            proto_type = infer_field_type(source, message_name, field)
            # repeated fields carry their own label; everything else is optional
            label = '' if proto_type.startswith('repeated ') else 'optional '
            out.append(f' {label}{proto_type} {field} = {number};')
        out.extend(['}', ''])

    return '\n'.join(out)
if __name__ == '__main__':
    # Command-line entry point: convert the Java file given as the first
    # argument into "<input stem>.proto" in the current working directory.
    if len(sys.argv) < 2:
        # Without an input file there is nothing to do; say so instead of
        # exiting silently with a success status.
        print(f"usage: {Path(sys.argv[0]).name} <file.java>")
        sys.exit(1)

    java_file = sys.argv[1]
    if not Path(java_file).exists():
        print(f"error: {java_file} not found")
        sys.exit(1)

    proto_content = generate_proto_file(java_file)
    # use input filename without path as output filename
    output_file = Path(java_file).stem + '.proto'
    Path(output_file).write_text(proto_content)
    print(f"\ngenerated {output_file}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment