Instantly share code, notes, and snippets.
Created
December 2, 2025 15:23
-
Star
0
(0)
You must be signed in to star a gist -
Fork
0
(0)
You must be signed in to fork a gist
-
-
Save db-pj/a02c22c42b0f1f7ed90f6bc603b91096 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?php | |
| /** | |
| * Tag Assignment Script for BadCredit.org | |
| * | |
| * Analyzes all published posts and assigns one main tag and optionally one subtag | |
| * based on the tag definitions generated by generate-tags.php | |
| * | |
| * Output: CSV file with post assignments in volumes/phpfpm/scripts/tags/output/ | |
| */ | |
// WordPress bootstrap: wp-load.php is located through the PUBLIC_HTML environment variable.
$public_html = getenv('PUBLIC_HTML');
if (!$public_html || !file_exists($public_html . '/wp-load.php')) {
    echo "Error: PUBLIC_HTML environment variable not set or wp-load.php not found\n";
    exit(1);
}
require_once($public_html . '/wp-load.php');

// Configuration
// The API key is taken from the OPENAI_API_KEY environment variable so the secret does not
// have to live in this file. define() is used because a `const` cannot hold a runtime value.
define('OPENAI_API_KEY', (string) (getenv('OPENAI_API_KEY') ?: ''));
const OPENAI_MODEL = 'gpt-4o';
const BATCH_SIZE = 20; // Number of posts sent to the API per request
const DEV_MODE = true; // If true, only process 100 posts for testing (limit applied in get_all_posts())
const OUTPUT_DIR = __DIR__ . '/output/';
const TAG_DEFINITIONS_FILE = __DIR__ . '/output/tag-definitions.json';

// Without a key every API request would be rejected, so stop before doing any work.
if (OPENAI_API_KEY === '') {
    echo "Error: OPENAI_API_KEY environment variable not set\n";
    exit(1);
}

// Ensure output directory exists (the second is_dir() covers a concurrent creation).
if (!is_dir(OUTPUT_DIR) && !mkdir(OUTPUT_DIR, 0755, true) && !is_dir(OUTPUT_DIR)) {
    echo "Error: Could not create output directory " . OUTPUT_DIR . "\n";
    exit(1);
}

echo "Starting Tag Assignment Process\n";
if (DEV_MODE) {
    echo "*** DEV MODE: Only processing 100 posts ***\n";
}
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n";
/**
 * Load and validate the tag definitions JSON file.
 *
 * On any failure (file missing, unreadable, not valid JSON, or lacking the
 * "main_tags" / "subtags" arrays) an error is printed and the script exits with code 1.
 *
 * @param string $file Path of the definitions file; defaults to TAG_DEFINITIONS_FILE.
 * @return array Decoded definitions; 'main_tags' and 'subtags' are guaranteed to be arrays.
 */
function load_tag_definitions(string $file = TAG_DEFINITIONS_FILE): array {
    if (!is_file($file)) {
        echo "Error: Tag definitions file not found at " . $file . "\n";
        echo " Please run generate-tags.php first to create the tag definitions.\n";
        exit(1);
    }

    $json = file_get_contents($file);
    if ($json === false) {
        echo "Error: Could not read tag definitions file at " . $file . "\n";
        exit(1);
    }

    $definitions = json_decode($json, true);
    $is_valid = is_array($definitions)
        && isset($definitions['main_tags'], $definitions['subtags'])
        && is_array($definitions['main_tags'])
        && is_array($definitions['subtags']);

    if (!$is_valid) {
        // Include the decoder's message when the problem is malformed JSON rather than a wrong shape.
        $detail = json_last_error() !== JSON_ERROR_NONE ? ' (' . json_last_error_msg() . ')' : '';
        echo "Error: Invalid tag definitions format" . $detail . "\n";
        exit(1);
    }

    return $definitions;
}
/**
 * Fetch published posts of type "post", newest first.
 *
 * When DEV_MODE is on the query is capped at 100 posts; otherwise no cap (-1) is applied.
 *
 * @return array The `posts` list of the executed WP_Query.
 */
function get_all_posts() {
    $query = new WP_Query(array(
        'order' => 'DESC',
        'orderby' => 'date',
        'posts_per_page' => DEV_MODE ? 100 : -1,
        'post_status' => 'publish',
        'post_type' => 'post',
    ));

    return $query->posts;
}
/**
 * Reduce a post's content to plain text suitable for sending to the API.
 *
 * Shortcodes and HTML tags are removed, whitespace runs are collapsed to single spaces,
 * and the result is limited to the first 3000 characters.
 *
 * @param object $post Object exposing a `post_content` property.
 * @return string Trimmed plain-text excerpt.
 */
function extract_clean_content($post) {
    $max_chars = 3000;

    $content = (string) $post->post_content;
    // Remove shortcodes
    $content = strip_shortcodes($content);
    // Remove HTML tags
    $content = wp_strip_all_tags($content);

    // Collapse whitespace; preg_replace() returns null on a PCRE error, in which case
    // the uncollapsed text is kept rather than losing the content.
    $collapsed = preg_replace('/\s+/', ' ', $content);
    if ($collapsed !== null) {
        $content = $collapsed;
    }

    // Truncate by characters, not bytes: cutting a multi-byte UTF-8 sequence in half
    // yields invalid UTF-8, which makes json_encode() of the whole batch fail.
    if (function_exists('mb_substr')) {
        $content = mb_substr($content, 0, $max_chars, 'UTF-8');
    } else {
        $content = substr($content, 0, $max_chars);
    }

    return trim($content);
}
/**
 * Collect the fields of a post that the tagging run needs.
 *
 * @param object $post Post object providing `ID`, `post_title` and `post_content`.
 * @return array Keys: post_id, title, url (permalink), content (cleaned text), categories (names).
 */
function get_post_data($post) {
    $id = $post->ID;

    return array(
        'post_id' => $id,
        'title' => $post->post_title,
        'url' => get_permalink($id),
        'content' => extract_clean_content($post),
        'categories' => wp_get_post_categories($id, array('fields' => 'names')),
    );
}
/**
 * Build the user prompt asking the model to tag a batch of articles.
 *
 * @param array $posts_data      List of article records; serialised to JSON inside the prompt.
 * @param array $tag_definitions Definitions with 'main_tags' (tag => ['definition' => ...]) and
 *                               'subtags' (parent => [subtag => ['definition' => ...]]).
 * @return string The complete prompt text.
 */
function build_prompt(array $posts_data, array $tag_definitions): string {
    // Tolerate entries without a 'definition' instead of raising warnings mid-prompt.
    $definition_of = function ($data) {
        return (is_array($data) && isset($data['definition'])) ? $data['definition'] : '';
    };

    $main_tags_list = "";
    foreach ($tag_definitions['main_tags'] as $tag => $data) {
        $main_tags_list .= "- **$tag**: " . $definition_of($data) . "\n";
    }

    $subtags_list = "";
    foreach ($tag_definitions['subtags'] as $parent => $subtags) {
        $subtags_list .= "\n**$parent** subtags:\n";
        foreach ((array) $subtags as $subtag => $data) {
            $subtags_list .= " - **$subtag**: " . $definition_of($data) . "\n";
        }
    }

    // json_encode() returns false on invalid UTF-8, which would silently produce a prompt
    // with no articles. Substitute bad bytes where supported (PHP 7.2+) and otherwise let
    // the encoder null out only the offending value.
    $json_flags = JSON_PRETTY_PRINT | JSON_PARTIAL_OUTPUT_ON_ERROR;
    if (defined('JSON_INVALID_UTF8_SUBSTITUTE')) {
        $json_flags |= JSON_INVALID_UTF8_SUBSTITUTE;
    }
    $articles_json = json_encode($posts_data, $json_flags);
    if ($articles_json === false) {
        $articles_json = '[]';
    }

    return <<<PROMPT
You are a content categorization expert for BadCredit.org, a financial advice website focused on helping people with bad credit.
Your task is to assign ONE main tag and OPTIONALLY one subtag to each article based on its content.
## Available Tags:
### Main Tags:
$main_tags_list
### Subtags (organized by parent):
$subtags_list
## Assignment Rules:
1. **Prioritize the title heavily (60% weight)** - The article title is the most important indicator of its primary topic
2. Consider content (40% weight) - The article body provides supporting context
3. **Assign exactly ONE main tag** - Choose the single most relevant tag
4. **Optionally assign ONE subtag** - Only if the article clearly focuses on that specific subtopic
5. **Subtag must match parent** - If you assign a subtag, it must be a child of the assigned main tag
6. If unsure between tags, choose the more specific one
7. If an article doesn't clearly fit any subtag, leave subtag empty
## Articles to Categorize:
$articles_json
## Response Format:
Respond with ONLY a valid JSON object in this exact format:
{
"assignments": [
{
"post_id": 123,
"main_tag": "Tag Name",
"subtag": "Subtag Name"
},
{
"post_id": 124,
"main_tag": "Tag Name",
"subtag": ""
}
]
}
IMPORTANT:
- Use exact tag names from the definitions above
- Every article must have a main_tag
- subtag can be empty string "" if not applicable
- Respond ONLY with valid JSON, no other text
PROMPT;
}
/**
 * Send one batch of articles to the OpenAI chat completions API and return its tag assignments.
 *
 * Every failure path (request encoding, transport error, non-200 status, unparseable reply)
 * prints a diagnostic and returns null so the caller can skip the batch.
 *
 * @param array $posts_data      Article records for this batch (passed to build_prompt()).
 * @param array $tag_definitions Tag definitions (passed to build_prompt()).
 * @return array|null List of assignment records from the reply's "assignments" key, or null on failure.
 */
function assign_tags_batch($posts_data, $tag_definitions) {
    $prompt = build_prompt($posts_data, $tag_definitions);

    $payload = json_encode(array(
        'model' => OPENAI_MODEL,
        'messages' => array(
            array('role' => 'system', 'content' => 'You are a content categorization expert. Respond only with valid JSON.'),
            array('role' => 'user', 'content' => $prompt),
        ),
        'temperature' => 0.2,
        'max_tokens' => 2000,
    ));
    if ($payload === false) {
        echo "API Error: could not encode request (" . json_last_error_msg() . ")\n";
        return null;
    }

    $ch = curl_init('https://api.openai.com/v1/chat/completions');
    if ($ch === false) {
        echo "API Error: could not initialise cURL\n";
        return null;
    }
    curl_setopt_array($ch, array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => $payload,
        CURLOPT_HTTPHEADER => array(
            'Content-Type: application/json',
            'Authorization: Bearer ' . OPENAI_API_KEY,
        ),
        // Without timeouts a stalled connection would block the whole run indefinitely.
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT => 120,
    ));

    $response = curl_exec($ch);
    $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $transport_error = curl_error($ch);
    curl_close($ch);

    // curl_exec() returns false when the request never completed (DNS, TLS, timeout, ...).
    if ($response === false) {
        echo "API Error (request failed): $transport_error\n";
        return null;
    }
    if ($http_code !== 200) {
        echo "API Error (HTTP $http_code): $response\n";
        return null;
    }

    $result = json_decode($response, true);
    $content = $result['choices'][0]['message']['content'] ?? null;
    if (!is_string($content)) {
        echo "API Error: response did not contain a message\n";
        return null;
    }

    // Remove a surrounding markdown code fence, with or without the "json" language tag.
    $content = trim($content);
    $content = preg_replace('/^```(?:json)?\s*/i', '', $content);
    $content = preg_replace('/\s*```\s*$/', '', (string) $content);

    $decoded = json_decode((string) $content, true);
    if (!is_array($decoded) || !isset($decoded['assignments']) || !is_array($decoded['assignments'])) {
        echo "API Error: could not parse assignments from response (" . json_last_error_msg() . ")\n";
        return null;
    }

    return $decoded['assignments'];
}
/**
 * Write the per-post tag assignments to a CSV file.
 *
 * @param array  $all_assignments Records with keys post_id, url, title, main_tag and optional subtag.
 * @param string $output_file     Destination path; an existing file is overwritten.
 * @return void
 * @throws RuntimeException When the destination cannot be opened for writing.
 */
function write_csv($all_assignments, $output_file) {
    $fp = fopen($output_file, 'w');
    if ($fp === false) {
        throw new RuntimeException("Could not open $output_file for writing");
    }

    // Delimiter, enclosure and escape are spelled out (they equal the historical defaults)
    // because relying on the implicit escape default is deprecated in newer PHP versions.
    $put = function (array $row) use ($fp) {
        fputcsv($fp, $row, ',', '"', '\\');
    };

    // Write header
    $put(array('post_id', 'url', 'title', 'main_tag', 'subtag'));

    // Write data; absent fields become empty cells instead of raising warnings.
    foreach ($all_assignments as $assignment) {
        $put(array(
            $assignment['post_id'] ?? '',
            $assignment['url'] ?? '',
            $assignment['title'] ?? '',
            $assignment['main_tag'] ?? '',
            $assignment['subtag'] ?? '',
        ));
    }

    fclose($fp);
}
/**
 * Write a CSV summarising how many articles received each main tag and each subtag.
 *
 * Main tags are listed by descending article count; each main-tag row (empty Subtag column)
 * is followed by its subtags, also by descending count.
 *
 * @param array  $all_assignments Records with key main_tag and optional subtag.
 * @param string $output_file     Destination path; an existing file is overwritten.
 * @return void
 * @throws RuntimeException When the destination cannot be opened for writing.
 */
function generate_tag_counts($all_assignments, $output_file) {
    $main_counts = array();
    $subtag_counts = array();

    // Tally main tags, and subtags grouped under their main tag.
    foreach ($all_assignments as $assignment) {
        $main = $assignment['main_tag'] ?? '';
        $sub = $assignment['subtag'] ?? '';

        $main_counts[$main] = ($main_counts[$main] ?? 0) + 1;

        // Compare against '' explicitly: empty() would also discard a subtag named "0".
        if ($sub !== '' && $sub !== null) {
            $subtag_counts[$main][$sub] = ($subtag_counts[$main][$sub] ?? 0) + 1;
        }
    }

    // Sort main tags by count descending
    arsort($main_counts);

    $fp = fopen($output_file, 'w');
    if ($fp === false) {
        throw new RuntimeException("Could not open $output_file for writing");
    }

    // Explicit delimiter/enclosure/escape equal the historical defaults and avoid the
    // deprecation of the implicit escape default in newer PHP versions.
    $put = function (array $row) use ($fp) {
        fputcsv($fp, $row, ',', '"', '\\');
    };

    $put(array('Main Tag', 'Subtag', 'Article Count'));

    foreach ($main_counts as $main_tag => $main_count) {
        // Parent row carries an empty subtag column.
        $put(array($main_tag, '', $main_count));

        if (isset($subtag_counts[$main_tag])) {
            arsort($subtag_counts[$main_tag]);
            foreach ($subtag_counts[$main_tag] as $subtag => $subtag_count) {
                $put(array($main_tag, $subtag, $subtag_count));
            }
        }
    }

    fclose($fp);
}
// ============================================================================
// MAIN EXECUTION
// Loads the tag definitions, queries the posts, asks the API for tags batch by
// batch, then writes the assignment and tag-count CSV files and prints a summary.
// ============================================================================
echo "Step 1: Loading tag definitions...\n";
$tag_definitions = load_tag_definitions();
$main_tag_count = count($tag_definitions['main_tags']);
$subtag_count = 0;
foreach ($tag_definitions['subtags'] as $children) {
    if (is_array($children)) {
        $subtag_count += count($children);
    }
}
echo " Loaded $main_tag_count main tags and $subtag_count subtags\n\n";

echo "Step 2: Querying posts...\n";
$posts = get_all_posts();
$total_posts = count($posts);
if (DEV_MODE) {
    echo " Found $total_posts posts (DEV MODE - limited to 100)\n\n";
} else {
    echo " Found $total_posts posts\n\n";
}

echo "Step 3: Assigning tags with OpenAI API...\n";
echo " (This may take 5-10 minutes for $total_posts posts)\n";

// Gather each post's data once; it feeds both the API payload and the CSV rows.
$post_data_map = array();
foreach ($posts as $post) {
    $post_data_map[$post->ID] = get_post_data($post);
}

$batches = array_chunk($posts, BATCH_SIZE);
$total_batches = count($batches);
$current_batch = 0;
// Keyed by post ID so a post the model returns twice is only recorded once.
$all_assignments = array();

foreach ($batches as $batch) {
    $current_batch++;

    // Prepare batch data (without URL for API, to save tokens)
    $batch_for_api = array();
    $batch_ids = array();
    foreach ($batch as $post) {
        $entry = $post_data_map[$post->ID];
        unset($entry['url']);
        $batch_for_api[] = $entry;
        $batch_ids[$post->ID] = true;
    }

    echo "Processing batch $current_batch/$total_batches (" . count($batch_for_api) . " posts)...\n";
    $assignments = assign_tags_batch($batch_for_api, $tag_definitions);

    if ($assignments) {
        foreach ($assignments as $assignment) {
            // The reply is model output: ignore entries that are malformed or that
            // name a post which was not part of this batch.
            if (!is_array($assignment) || !isset($assignment['post_id'], $assignment['main_tag'])) {
                continue;
            }
            $raw_id = $assignment['post_id'];
            if (!is_int($raw_id) && !(is_string($raw_id) && ctype_digit($raw_id))) {
                continue;
            }
            $post_id = (int) $raw_id;
            $main_tag = $assignment['main_tag'];
            if (!isset($batch_ids[$post_id]) || !is_string($main_tag) || $main_tag === '') {
                continue;
            }
            $subtag = (isset($assignment['subtag']) && is_string($assignment['subtag'])) ? $assignment['subtag'] : '';

            // Flag names that are not in the definitions so they can be reviewed; the row is kept as returned.
            if (!isset($tag_definitions['main_tags'][$main_tag])) {
                echo " Warning: post $post_id was given unknown main tag \"$main_tag\"\n";
            } elseif ($subtag !== '' && !isset($tag_definitions['subtags'][$main_tag][$subtag])) {
                echo " Warning: post $post_id was given subtag \"$subtag\" which is not defined under \"$main_tag\"\n";
            }

            $all_assignments[$post_id] = array(
                'post_id' => $post_id,
                'url' => $post_data_map[$post_id]['url'],
                'title' => $post_data_map[$post_id]['title'],
                'main_tag' => $main_tag,
                'subtag' => $subtag,
            );
        }
        echo " Batch complete\n";
    } else {
        echo " Batch failed, skipping...\n";
    }

    // Rate limiting - wait 2 seconds between batches
    if ($current_batch < $total_batches) {
        sleep(2);
    }
}

echo "\n Step 4: Saving results...\n";
$timestamp = date('Y-m-d-His');
$assignments_file = OUTPUT_DIR . "article-tag-assignments-{$timestamp}.csv";
$counts_file = OUTPUT_DIR . "tag-counts-{$timestamp}.csv";
write_csv($all_assignments, $assignments_file);
echo " Assignments saved to: $assignments_file\n";
generate_tag_counts($all_assignments, $counts_file);
echo " Tag counts saved to: $counts_file\n\n";

echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n";
echo "Tag assignment complete!\n\n";
echo " Summary:\n";
echo " Total posts processed: " . count($all_assignments) . " / $total_posts\n";
echo " Assignments file: $assignments_file\n";
echo " Tag counts file: $counts_file\n\n";

// Show top 10 assigned tags
echo " Top 10 Most Assigned Tags:\n";
$counts = array_count_values(array_column($all_assignments, 'main_tag'));
arsort($counts);
foreach (array_slice($counts, 0, 10, true) as $tag => $num) {
    echo " • $tag: $num articles\n";
}
echo "\nDone!\n";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment