Created
December 11, 2025 22:58
-
-
Save dewomser/cfe6a458ee989dc5126ca06c9c70ea13 to your computer and use it in GitHub Desktop.
Grüne Politiker aus Worms von der Webseite gekratzt im JSON-Format
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import subprocess | |
| from bs4 import BeautifulSoup | |
| import json | |
def fetch_html_with_curl(url):
    """Fetch the document at ``url`` by shelling out to ``curl``.

    Args:
        url: Address of the page to download. Redirects are followed.

    Returns:
        The response body as ``str`` (decoded as UTF-8, undecodable bytes
        replaced), or ``None`` when curl is not installed or exits with a
        non-zero status (network error, timeout, HTTP status >= 400).
    """
    # Browser-like User-Agent sent with the request.
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    )
    command = [
        'curl',
        # Turn HTTP errors into a non-zero exit status instead of handing
        # the server's error page back as if it were the requested content.
        '--fail',
        # Give up after 60 seconds so a stalled connection cannot hang the
        # script; curl then exits non-zero and we report the failure.
        '--max-time', '60',
        '-A', user_agent,
        '-L',
        url,
    ]
    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            # Decode explicitly instead of relying on the locale encoding,
            # which garbles umlauts on non-UTF-8 systems.
            # NOTE(review): assumes the target page is served as UTF-8.
            encoding='utf-8',
            errors='replace',
            check=True,
        )
        return result.stdout
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        print(f"Fehler beim Abrufen von {url} mit curl: {e}")
        return None
def parse_gruene_fraktion(html_content):
    """Extract the members listed on the Greens' council-faction page.

    Args:
        html_content: HTML source of the page; may be empty or ``None``.

    Returns:
        A list of ``{"name": str, "functions": list[str]}`` dicts sorted by
        name. Entries without a name or without any function are skipped.
        An empty list is returned for empty input.
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    politicians = []

    # Each member entry is an element with the class 'wp-block-media-text'.
    for block in soup.select('.wp-block-media-text'):
        content_div = block.select_one('.wp-block-media-text__content')
        if not content_div:
            continue

        # The name is the paragraph marked with 'has-large-font-size'.
        name_tag = content_div.select_one('p.has-large-font-size')
        if not name_tag:
            continue

        # Pass an explicit separator: with strip=True alone, text fragments
        # from adjacent inline tags (<strong>, <br>, ...) are glued together
        # without any space between them.
        name = name_tag.get_text(' ', strip=True)

        # Every other paragraph in the entry describes a function/role.
        functions = []
        for tag in content_div.select('p:not(.has-large-font-size)'):
            func_text = tag.get_text(' ', strip=True)
            # Skip empty paragraphs and obfuscated e-mail addresses.
            if func_text and '[at]' not in func_text:
                functions.append(func_text)

        if name and functions:
            politicians.append({"name": name, "functions": functions})

    return sorted(politicians, key=lambda x: x["name"])
def main():
    """Download the faction page, parse it and store the result as JSON.

    Prints progress and error messages to stdout. Nothing is written when
    the page could not be fetched.
    """
    fraktion_url = "https://gruene-worms.de/stadtratsfraktion/"
    output_filename = "gruene_politiker_worms.json"

    print(f"Rufe HTML von {fraktion_url} ab...")
    page_html = fetch_html_with_curl(fraktion_url)

    if page_html:
        print("Parse Fraktions-Daten...")
        members = parse_gruene_fraktion(page_html)

        try:
            with open(output_filename, mode='w', encoding='utf-8') as out_file:
                # ensure_ascii=False keeps umlauts readable in the output.
                json.dump(members, out_file, ensure_ascii=False, indent=2)
            print(f"Erfolgreich {output_filename} erstellt.")
        except IOError as e:
            print(f"Fehler beim Schreiben der Datei {output_filename}: {e}")
    else:
        # Fetching failed or returned an empty body: stop without writing.
        print("Konnte die Webseite nicht abrufen. Das Skript wird beendet.")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| [ | |
| { | |
| "name": "Anna Biegler", | |
| "functions": [ | |
| "Fraktionsvorsitzende" | |
| ] | |
| }, | |
| { | |
| "name": "Carolin Cloos", | |
| "functions": [ | |
| "Stellvertretende Fraktionsvorsitzende" | |
| ] | |
| }, | |
| { | |
| "name": "Heike Jores", | |
| "functions": [ | |
| "Stellvertretende Fraktionsvorsitzende" | |
| ] | |
| }, | |
| { | |
| "name": "Leonhard Schmitt", | |
| "functions": [ | |
| "Stellvertretender Fraktionsvorsitzender", | |
| "Sprecher für Soziales" | |
| ] | |
| } | |
| ] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment