Instantly share code, notes, and snippets.
Created
December 1, 2025 20:32
-
Star
0
(0)
You must be signed in to star a gist -
Fork
0
(0)
You must be signed in to fork a gist
-
-
Save db-pj/3b7616896f853045cb2a6ec8bb23dbd8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Article Tag Assignment Script
 * Uses ChatGPT and Perplexity APIs to assign tags and subtags to articles
 *
 * Usage: php assign-tags.php
 *
 * Flow: validate config -> load tag definitions -> load published posts from
 * WordPress -> call the enabled LLM(s) batch by batch -> append each batch to
 * the output CSV -> write per-provider tag count summaries -> print a cost
 * estimate based on the token usage reported by the APIs.
 *
 * NOTE(review): OPENAI_API_KEY and PERPLEXITY_API_KEY are referenced by this
 * script but are not defined in this file; presumably they come from the
 * WordPress bootstrap (wp-config.php) - confirm.
 */

// =========================================
// CONFIGURATION
// =========================================

// API Models
const OPENAI_MODEL = 'gpt-4o';
const PERPLEXITY_MODEL = 'sonar-pro';

// API Pricing (per 1M tokens)
const OPENAI_INPUT_PRICE = 2.50;      // $2.50 per 1M input tokens (gpt-4o)
const OPENAI_OUTPUT_PRICE = 10.00;    // $10.00 per 1M output tokens (gpt-4o)
const PERPLEXITY_INPUT_PRICE = 1.00;  // $1.00 per 1M input tokens (sonar-pro)
const PERPLEXITY_OUTPUT_PRICE = 5.00; // $5.00 per 1M output tokens (sonar-pro)

// LLM Selection - Enable/disable which LLMs to use
const USE_CHATGPT = true;     // Run ChatGPT tagging
const USE_PERPLEXITY = false; // Run Perplexity tagging

// Processing Configuration
const BATCH_SIZE = 1;            // Number of articles to process per API call (reduced to 1 for full article content)
const DEV_MODE = false;          // When true, only process the first DEV_MODE_LIMIT articles
const DEV_MODE_LIMIT = 50;       // Number of articles processed when DEV_MODE is enabled
const MAX_RETRIES = 3;           // Maximum retry attempts for failed API calls
const MAX_ARTICLE_WORDS = 10000; // Maximum words from article content (to prevent token limit issues)

// File Paths
const OUTPUT_CSV = __DIR__ . '/output/article-tags.csv';
const TAG_DEFINITIONS = __DIR__ . '/tag-definitions.json';

// WordPress Bootstrap
$public_html = getenv('PUBLIC_HTML');
if (!$public_html || !file_exists($public_html . '/wp-load.php')) {
    echo "❌ Error: PUBLIC_HTML environment variable not set or wp-load.php not found\n";
    exit(1);
}
require_once($public_html . '/wp-load.php');

// API Endpoints
const OPENAI_ENDPOINT = 'https://api.openai.com/v1/chat/completions';
const PERPLEXITY_ENDPOINT = 'https://api.perplexity.ai/chat/completions';

// =========================================
// MAIN SCRIPT
// =========================================

echo "========================================\n";
echo "Article Tag Assignment Script\n";
echo "========================================\n\n";

// Initialize cost tracking (token totals are accumulated by reference in the process* functions)
$tokenUsage = [
    'chatgpt_input' => 0,
    'chatgpt_output' => 0,
    'perplexity_input' => 0,
    'perplexity_output' => 0,
];

// Validate configuration
if (!validateConfig()) {
    exit(1);
}

// Load tag definitions
$tagDefinitions = loadTagDefinitions();
if (!$tagDefinitions) {
    echo "❌ Failed to load tag definitions\n";
    exit(1);
}
echo "✅ Loaded " . count($tagDefinitions['main_tags']) . " main tags\n";
echo "✅ Loaded subtag definitions\n\n";

// Load articles from database
$articles = loadArticlesFromDatabase();
if (!$articles) {
    echo "❌ Failed to load articles from database\n";
    exit(1);
}
$totalArticles = count($articles);
echo "✅ Loaded {$totalArticles} articles from database\n";

// Apply DEV mode limit
if (DEV_MODE) {
    $articles = array_slice($articles, 0, DEV_MODE_LIMIT);
    echo "🔧 DEV MODE: Limited to " . count($articles) . " articles\n";
}

// Delete existing output files if they exist (fresh start)
$outputFiles = [
    OUTPUT_CSV,
    __DIR__ . '/output/chatgpt-tag-counts.csv',
    __DIR__ . '/output/perplexity-tag-counts.csv',
];
foreach ($outputFiles as $file) {
    if (file_exists($file)) {
        unlink($file);
        echo "🗑️ Deleted existing file: " . basename($file) . "\n";
    }
}

// Number of articles this run will actually process (after the DEV mode limit)
$articlesToProcess = count($articles);
echo "\n📊 " . $articlesToProcess . " articles to process\n\n";

// Process articles in batches
$batches = array_chunk($articles, BATCH_SIZE);
$totalBatches = count($batches);
$processedCount = 0;
echo "Starting batch processing ({$totalBatches} batches of " . BATCH_SIZE . " articles)...\n\n";

foreach ($batches as $batchNum => $batch) {
    $batchIndex = $batchNum + 1;
    echo "=== Batch {$batchIndex}/{$totalBatches} ===\n";

    $chatgptResults = null;
    $perplexityResults = null;

    // Process with ChatGPT (if enabled)
    if (USE_CHATGPT) {
        echo "🤖 Calling ChatGPT API...\n";
        $chatgptResults = processWithChatGPT($batch, $tagDefinitions, $tokenUsage);
    } else {
        echo "⏭️ ChatGPT disabled - skipping\n";
    }

    // Process with Perplexity (if enabled)
    if (USE_PERPLEXITY) {
        echo "🔍 Calling Perplexity API...\n";
        $perplexityResults = processWithPerplexity($batch, $tagDefinitions, $tokenUsage);
    } else {
        echo "⏭️ Perplexity disabled - skipping\n";
    }

    // Combine results and write to CSV
    $combinedResults = combineResults($batch, $chatgptResults, $perplexityResults);
    appendToOutputCSV($combinedResults);

    $processedCount += count($batch);
    // Progress is reported against the number of articles in this run
    echo "✅ Batch complete ({$processedCount}/{$articlesToProcess} articles processed)\n\n";

    // Small delay between batches to avoid rate limiting
    if ($batchIndex < $totalBatches) {
        sleep(1);
    }
}

// Generate tag count summaries
echo "\n📊 Generating tag count summaries...\n";
generateTagCounts();

// Calculate costs (prices are expressed per 1M tokens)
$chatgptCost = ($tokenUsage['chatgpt_input'] / 1000000 * OPENAI_INPUT_PRICE) +
    ($tokenUsage['chatgpt_output'] / 1000000 * OPENAI_OUTPUT_PRICE);
$perplexityCost = ($tokenUsage['perplexity_input'] / 1000000 * PERPLEXITY_INPUT_PRICE) +
    ($tokenUsage['perplexity_output'] / 1000000 * PERPLEXITY_OUTPUT_PRICE);
$totalCost = $chatgptCost + $perplexityCost;

echo "\n========================================\n";
echo "✅ Processing complete!\n";
echo "========================================\n";
echo "📄 Results: " . OUTPUT_CSV . "\n";
if (USE_CHATGPT) {
    echo "📊 ChatGPT counts: " . __DIR__ . '/output/chatgpt-tag-counts.csv' . "\n";
}
if (USE_PERPLEXITY) {
    echo "📊 Perplexity counts: " . __DIR__ . '/output/perplexity-tag-counts.csv' . "\n";
}
echo "\n";
echo "💰 API Cost Estimate:\n";
if (USE_CHATGPT) {
    echo " ChatGPT: $" . number_format($chatgptCost, 4) . " ";
    echo "(" . number_format($tokenUsage['chatgpt_input']) . " in / ";
    echo number_format($tokenUsage['chatgpt_output']) . " out tokens)\n";
}
if (USE_PERPLEXITY) {
    echo " Perplexity: $" . number_format($perplexityCost, 4) . " ";
    echo "(" . number_format($tokenUsage['perplexity_input']) . " in / ";
    echo number_format($tokenUsage['perplexity_output']) . " out tokens)\n";
}
if (USE_CHATGPT || USE_PERPLEXITY) {
    echo " Total: $" . number_format($totalCost, 4) . "\n";
}
echo "========================================\n";
| // ========================================= | |
| // FUNCTIONS | |
| // ========================================= | |
/**
 * Validate configuration before any processing starts.
 *
 * Checks that at least one LLM is enabled, that the API key constant of each
 * enabled LLM is defined, non-empty and not the template placeholder, and that
 * the tag definitions file exists. Also creates the output directory when it
 * is missing. All problems are collected and printed together.
 *
 * @return bool True when the configuration is usable, false otherwise.
 */
function validateConfig() {
    $errors = [];

    // An API key counts as missing when its constant is undefined (referencing
    // it directly would be a fatal error on PHP 8), blank, or still the placeholder.
    $keyMissing = static function ($constantName, $placeholder) {
        if (!defined($constantName)) {
            return true;
        }
        $value = constant($constantName);
        return !is_string($value) || trim($value) === '' || $value === $placeholder;
    };

    // Check that at least one LLM is enabled
    if (!USE_CHATGPT && !USE_PERPLEXITY) {
        $errors[] = "At least one LLM must be enabled (USE_CHATGPT or USE_PERPLEXITY)";
    }

    // Only validate API keys for enabled LLMs
    if (USE_CHATGPT && $keyMissing('OPENAI_API_KEY', 'your-openai-api-key-here')) {
        $errors[] = "OpenAI API key not configured (required when USE_CHATGPT is true)";
    }
    if (USE_PERPLEXITY && $keyMissing('PERPLEXITY_API_KEY', 'your-perplexity-api-key-here')) {
        $errors[] = "Perplexity API key not configured (required when USE_PERPLEXITY is true)";
    }

    if (!file_exists(TAG_DEFINITIONS)) {
        $errors[] = "Tag definitions file not found: " . TAG_DEFINITIONS;
    }

    // Create output directory if it doesn't exist; the second is_dir() covers
    // the case where another process created it between the check and mkdir().
    $outputDir = dirname(OUTPUT_CSV);
    if (!is_dir($outputDir) && !mkdir($outputDir, 0755, true) && !is_dir($outputDir)) {
        $errors[] = "Unable to create output directory: " . $outputDir;
    }

    if (!empty($errors)) {
        echo "❌ Configuration errors:\n";
        foreach ($errors as $error) {
            echo " - {$error}\n";
        }
        return false;
    }

    return true;
}
/**
 * Load tag definitions from the JSON file named by TAG_DEFINITIONS.
 *
 * The decoded document must be an object containing `main_tags` and `subtags`
 * collections, because the rest of the script indexes both keys directly.
 *
 * @return array|null Decoded definitions, or null when the file cannot be
 *                    read, is not valid JSON, or lacks the required keys.
 */
function loadTagDefinitions() {
    if (!is_file(TAG_DEFINITIONS) || !is_readable(TAG_DEFINITIONS)) {
        echo "⚠️ Tag definitions file is not readable: " . TAG_DEFINITIONS . "\n";
        return null;
    }

    $json = file_get_contents(TAG_DEFINITIONS);
    if ($json === false) {
        echo "⚠️ Could not read tag definitions file: " . TAG_DEFINITIONS . "\n";
        return null;
    }

    $definitions = json_decode($json, true);
    if (!is_array($definitions)) {
        echo "⚠️ Tag definitions file is not valid JSON: " . json_last_error_msg() . "\n";
        return null;
    }

    // Both collections are iterated when the prompt is built, so reject a
    // document that would only fail later with a less helpful error.
    foreach (['main_tags', 'subtags'] as $requiredKey) {
        if (!isset($definitions[$requiredKey]) || !is_array($definitions[$requiredKey])) {
            echo "⚠️ Tag definitions file is missing the '{$requiredKey}' section\n";
            return null;
        }
    }

    return $definitions;
}
/**
 * Load all published posts from WordPress and derive the fields used for tagging.
 *
 * Posts are fetched page by page (100 per query). For every post the function
 * collects identifying data (ID, permalink, title, optional differing H1,
 * first category, Yoast meta description and focus keyword, existing tags),
 * a plain-text version of the content truncated to MAX_ARTICLE_WORDS words,
 * and three keyword-derived hints taken from the raw content: card issuer
 * mentions, card products referenced through shortcodes, and offer features.
 *
 * @return array List of associative arrays with the keys post_id, url, title,
 *               h1_if_different, section_category, meta_description,
 *               full_article_content, card_issuer_mentions,
 *               card_product_mentions, offer_features, existing_tags and
 *               top_keywords. Empty when no published posts exist.
 */
function loadArticlesFromDatabase() {
    echo "🔍 Querying WordPress database...\n";

    $articles = [];
    $posts_per_page = 100;

    // Card issuers to match
    $card_issuers = [
        'Chase', 'American Express', 'Amex', 'Citi', 'Citibank',
        'Bank of America', 'Capital One', 'Wells Fargo', 'Discover',
        'Barclays', 'U.S. Bank', 'PNC', 'TD Bank', 'USAA',
        'Navy Federal', 'Synchrony', 'Apple', 'Goldman Sachs',
    ];

    // Offer features patterns (all lowercase; matched against lowercased content)
    $offer_features_patterns = [
        'no_annual_fee' => ['no annual fee', '$0 annual fee', 'no yearly fee', 'zero annual fee'],
        '0_intro_apr_purchases' => ['0% intro apr on purchases', '0% apr on purchases', 'intro apr purchases'],
        '0_intro_apr_bt' => ['0% intro apr on balance transfers', '0% balance transfer', 'intro apr balance transfer'],
        'welcome_bonus' => ['welcome bonus', 'sign-up bonus', 'signup bonus', 'intro bonus'],
        'cash_back' => ['cash back', 'cashback'],
        'points' => ['points', 'reward points'],
        'miles' => ['miles', 'airline miles', 'travel miles'],
        'secured' => ['secured card', 'secured credit card'],
        'student' => ['student card', 'student credit card'],
        'business' => ['business card', 'business credit card'],
        'prequal' => ['prequalify', 'pre-qualify', 'prequalification', 'check eligibility'],
        'instant_number' => ['instant card number', 'instant approval', 'use instantly'],
    ];

    // Count published posts only: the query below is restricted to 'publish',
    // so counting drafts/trash as well would only add empty trailing pages.
    $post_types = ['post'];
    $total_count = 0;
    foreach ($post_types as $post_type) {
        $post_type_count = wp_count_posts($post_type);
        $total_count += isset($post_type_count->publish) ? (int) $post_type_count->publish : 0;
    }
    $number_of_pages = (int) ceil($total_count / $posts_per_page);

    // Query posts in batches
    for ($paged = 1; $paged <= $number_of_pages; $paged++) {
        $query_args = [
            'posts_per_page' => $posts_per_page,
            'paged' => $paged,
            // Date ascending with ID as tie-breaker: post dates are not unique,
            // and an ambiguous order can repeat or skip posts across pages.
            'orderby' => ['date' => 'ASC', 'ID' => 'ASC'],
            'post_type' => $post_types,
            'post_status' => 'publish',
            'update_post_term_cache' => true,
        ];
        $query = new WP_Query($query_args);

        while ($query->have_posts()) {
            $query->the_post();
            $post_id = get_the_ID();

            // URL
            $url = get_permalink();

            // Title
            $title = do_shortcode(get_the_title());

            // H1 (if different)
            // NOTE(review): get_full_review_content_data() is not defined in this
            // file; presumably a theme/plugin helper returning 'enabled' and 'id' - confirm.
            $h1 = '';
            if (function_exists('get_full_review_content_data')) {
                $full_review_content_data = get_full_review_content_data($post_id);
                if (!empty($full_review_content_data['enabled']) && !empty($full_review_content_data['id'])) {
                    $h1_title = do_shortcode(get_the_title($full_review_content_data['id']));
                    if ($h1_title !== $title) {
                        $h1 = $h1_title;
                    }
                }
            }

            // Section / Category (first category returned for the post)
            $section_category = '';
            $categories = get_the_category($post_id);
            if ($categories && !is_wp_error($categories)) {
                $primary_cat = $categories[0];
                $section_category = $primary_cat->name;
            }

            // Meta Description
            $meta_description = get_post_meta($post_id, '_yoast_wpseo_metadesc', true);

            // Get raw content for parsing
            $raw_content = get_the_content(null, false, $post_id);

            // Full Article Content (cleaned for LLM, with word limit)
            $clean_content = strip_shortcodes($raw_content);
            $clean_content = wp_strip_all_tags($clean_content);
            $clean_content = preg_replace('/\s+/', ' ', $clean_content);
            $clean_content = trim($clean_content);

            // Limit to MAX_ARTICLE_WORDS to prevent token limit issues
            $words = explode(' ', $clean_content);
            if (count($words) > MAX_ARTICLE_WORDS) {
                $words = array_slice($words, 0, MAX_ARTICLE_WORDS);
                $full_article_content = implode(' ', $words) . ' [Content truncated - article exceeds ' . MAX_ARTICLE_WORDS . ' words]';
            } else {
                $full_article_content = $clean_content;
            }

            // Card Issuer Mentions ("Amex" is reported as "American Express")
            $issuer_mentions = [];
            foreach ($card_issuers as $issuer) {
                if (stripos($raw_content, $issuer) !== false) {
                    $normalized_issuer = ($issuer === 'Amex') ? 'American Express' : $issuer;
                    if (!in_array($normalized_issuer, $issuer_mentions, true)) {
                        $issuer_mentions[] = $normalized_issuer;
                    }
                }
            }
            $card_issuer_mentions = implode(', ', $issuer_mentions);

            // Card Product Mentions: titles of the posts referenced by id in
            // offer-jump / jump_link / card_field shortcodes
            $card_mentions = [];
            if (preg_match_all('/\[(?:offer-jump|jump_link|card_field)[^\]]*id=["\']?(\d+)["\']?[^\]]*\]/i', $raw_content, $matches)) {
                foreach ($matches[1] as $card_id) {
                    $card_title = get_the_title((int) $card_id);
                    if ($card_title && !in_array($card_title, $card_mentions, true)) {
                        $card_mentions[] = $card_title;
                    }
                }
            }
            $card_product_mentions = implode(', ', $card_mentions);

            // Offer Features: a feature is detected when any of its patterns occurs
            $detected_features = [];
            $content_lower = strtolower($raw_content);
            foreach ($offer_features_patterns as $feature_key => $patterns) {
                foreach ($patterns as $pattern) {
                    if (strpos($content_lower, $pattern) !== false) {
                        $detected_features[] = $feature_key;
                        break;
                    }
                }
            }
            $offer_features = implode(', ', $detected_features);

            // Existing Tags
            $existing_tags = '';
            $tags = get_the_tags($post_id);
            if ($tags && !is_wp_error($tags)) {
                $tag_names = [];
                foreach ($tags as $tag) {
                    $tag_names[] = $tag->name;
                }
                $existing_tags = implode(', ', $tag_names);
            }

            // Top Keywords
            $top_keywords = get_post_meta($post_id, '_yoast_wpseo_focuskw', true);

            $articles[] = [
                'post_id' => $post_id,
                'url' => $url,
                'title' => $title,
                'h1_if_different' => $h1,
                'section_category' => $section_category,
                'meta_description' => $meta_description,
                'full_article_content' => $full_article_content,
                'card_issuer_mentions' => $card_issuer_mentions,
                'card_product_mentions' => $card_product_mentions,
                'offer_features' => $offer_features,
                'existing_tags' => $existing_tags,
                'top_keywords' => $top_keywords,
            ];
        }

        // Reset global post state and drop cached objects before the next page
        wp_reset_postdata();
        wp_reset_query();
        wp_cache_flush();
    }

    return $articles;
}
/**
 * Tag one batch of articles through the OpenAI chat completions endpoint.
 *
 * Sends the shared tagging prompt together with a strict JSON schema, adds the
 * reported prompt/completion token counts to $tokenUsage, and returns one
 * result entry per article.
 *
 * @param array $batch          Articles to tag.
 * @param array $tagDefinitions Decoded tag definitions.
 * @param array $tokenUsage     Running token totals, updated by reference.
 * @return array One ['tag', 'subtag', 'error'] entry per article; every entry
 *               is a MANUAL_REVIEW error entry when the API call fails.
 */
function processWithChatGPT($batch, $tagDefinitions, &$tokenUsage) {
    $articleCount = count($batch);

    $systemMessage = [
        'role' => 'system',
        'content' => 'You are an expert content taxonomist specializing in credit card and personal finance content. Your task is to accurately categorize articles based on their content and assign appropriate tags.',
    ];
    $userMessage = [
        'role' => 'user',
        'content' => buildPrompt($batch, $tagDefinitions),
    ];

    $payload = [
        'model' => OPENAI_MODEL,
        'messages' => [$systemMessage, $userMessage],
        'response_format' => [
            'type' => 'json_schema',
            'json_schema' => [
                'name' => 'article_tagging_response',
                'strict' => true,
                'schema' => buildResponseSchema($articleCount),
            ],
        ],
        'temperature' => 0.3,
        // Only the tags come back, so a small completion budget is enough
        'max_completion_tokens' => 1000,
    ];

    // The first article's URL is passed along purely for error logging
    $loggedUrl = $batch[0]['url'] ?? 'unknown';
    $response = makeAPICall(OPENAI_ENDPOINT, OPENAI_API_KEY, $payload, 'ChatGPT', $loggedUrl);

    if ($response === false) {
        // Flag every article in the batch for manual review
        $failure = ['tag' => 'MANUAL_REVIEW', 'subtag' => '', 'error' => true];
        return array_fill(0, $articleCount, $failure);
    }

    // Accumulate the token usage reported by the API, when present
    $usage = $response['usage'] ?? null;
    if ($usage !== null) {
        $tokenUsage['chatgpt_input'] += $usage['prompt_tokens'] ?? 0;
        $tokenUsage['chatgpt_output'] += $usage['completion_tokens'] ?? 0;
    }

    return parseAPIResponse($response, $articleCount);
}
/**
 * Tag one batch of articles through the Perplexity chat completions endpoint.
 *
 * Sends the shared tagging prompt together with a JSON schema, adds the
 * reported prompt/completion token counts to $tokenUsage, and returns one
 * result entry per article.
 *
 * @param array $batch          Articles to tag.
 * @param array $tagDefinitions Decoded tag definitions.
 * @param array $tokenUsage     Running token totals, updated by reference.
 * @return array One ['tag', 'subtag', 'error'] entry per article; every entry
 *               is a MANUAL_REVIEW error entry when the API call fails.
 */
function processWithPerplexity($batch, $tagDefinitions, &$tokenUsage) {
    $articleCount = count($batch);

    $systemMessage = [
        'role' => 'system',
        'content' => 'You are an expert content taxonomist specializing in credit card and personal finance content. Your task is to accurately categorize articles based on their content and assign appropriate tags.',
    ];
    $userMessage = [
        'role' => 'user',
        'content' => buildPrompt($batch, $tagDefinitions),
    ];

    $payload = [
        'model' => PERPLEXITY_MODEL,
        'messages' => [$systemMessage, $userMessage],
        'response_format' => [
            'type' => 'json_schema',
            'json_schema' => [
                'strict' => true,
                'schema' => buildResponseSchema($articleCount),
            ],
        ],
        'temperature' => 0.3,
        // Only the tags come back, so a small completion budget is enough
        'max_tokens' => 1000,
    ];

    // The first article's URL is passed along purely for error logging
    $loggedUrl = $batch[0]['url'] ?? 'unknown';
    $response = makeAPICall(PERPLEXITY_ENDPOINT, PERPLEXITY_API_KEY, $payload, 'Perplexity', $loggedUrl);

    if ($response === false) {
        // Flag every article in the batch for manual review
        $failure = ['tag' => 'MANUAL_REVIEW', 'subtag' => '', 'error' => true];
        return array_fill(0, $articleCount, $failure);
    }

    // Accumulate the token usage reported by the API, when present
    $usage = $response['usage'] ?? null;
    if ($usage !== null) {
        $tokenUsage['perplexity_input'] += $usage['prompt_tokens'] ?? 0;
        $tokenUsage['perplexity_output'] += $usage['completion_tokens'] ?? 0;
    }

    return parseAPIResponse($response, $articleCount);
}
/**
 * Assemble the tagging prompt sent to the LLM.
 *
 * The prompt consists of, in order: an instruction line, the main tag
 * definitions, the subtag definitions grouped by main tag, the tagging rules,
 * one section per article (title, full content, supporting metadata) and a
 * closing reminder about title precedence.
 *
 * @param array $batch          Articles; each needs 'title' and 'url', and may
 *                              carry 'full_article_content', 'meta_description'
 *                              and 'section_category'.
 * @param array $tagDefinitions Definitions with 'main_tags' and 'subtags'.
 * @return string The complete prompt text.
 */
function buildPrompt($batch, $tagDefinitions) {
    $divider = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━";

    // Collected in order and joined once at the end
    $parts = [];
    $parts[] = "Please analyze the following articles and assign appropriate tags based on the tag definitions provided.\n\n";

    // Main tag definitions
    $parts[] = "=== MAIN TAG DEFINITIONS ===\n\n";
    foreach ($tagDefinitions['main_tags'] as $mainTagName => $mainTagInfo) {
        $parts[] = '**' . $mainTagName . "**\n";
        $parts[] = $mainTagInfo['definition'] . "\n\n";
    }

    // Subtag definitions, grouped under their main tag
    $parts[] = "\n=== SUBTAG DEFINITIONS ===\n\n";
    foreach ($tagDefinitions['subtags'] as $parentTag => $children) {
        $parts[] = '**' . $parentTag . "** subtags:\n";
        foreach ($children as $childName => $childInfo) {
            $parts[] = ' - ' . $childName . ': ' . $childInfo['definition'] . "\n";
        }
        $parts[] = "\n";
    }

    // Fixed tagging rules
    $parts[] = "\n=== TAGGING RULES ===\n\n";
    $rules = [
        "1. Each article MUST be assigned exactly ONE main tag\n",
        "2. Each article MAY be assigned ONE subtag (or none) based on relevance and the article's main tag. For example, an article about a credit card that offers everyday purchases cash back may get the subtag 'Everyday Purchases' if the main tag is 'Cash Back Credit Cards'.\n",
        "3. **CRITICAL CONSTRAINT - SUBTAG VALIDATION**: \n",
        " - Subtags can ONLY be assigned if they belong to the article's assigned main tag\n",
        " - NEVER assign a subtag from a different main tag category\n",
        " - Before assigning a subtag, verify: Does this subtag appear in the subtag list for my chosen main tag?\n",
        " - If the answer is NO, do not assign that subtag. Leave it empty instead.\n",
        " - Example: 'Everyday Purchases' is ONLY valid for 'Cash Back Credit Cards', not for any other main tag\n",
        "4. **CRITICAL WEIGHTING**: The article TITLE should carry significantly more weight than the article content when determining tag relevance\n",
        " - Title = 60% importance (1.5x the weight of content)\n",
        " - Full Article Content = 40% importance\n",
        "5. First, determine what the title indicates the article is about\n",
        "6. Then, read the full article content to confirm and refine your understanding\n",
        "7. If the title and content suggest different tags, prioritize the title\n",
        "8. Use meta description and section category as minor supporting signals only\n\n",
        "**REMINDER**: Before finalizing your response, verify that any assigned subtag belongs to the article's assigned main tag. Cross-check the subtag definitions above.\n\n",
    ];
    $parts[] = implode('', $rules);

    // One section per article, numbered from 1
    $parts[] = "=== ARTICLES TO TAG ===\n\n";
    foreach ($batch as $position => $article) {
        $number = $position + 1;
        $parts[] = 'Article #' . $number . ":\n";
        $parts[] = $divider . "\n";
        $parts[] = '**TITLE (60% WEIGHT)**: ' . $article['title'] . "\n";
        $parts[] = $divider . "\n\n";

        $parts[] = "**FULL ARTICLE CONTENT (40% WEIGHT)**:\n";
        if (!empty($article['full_article_content'])) {
            $parts[] = $article['full_article_content'] . "\n\n";
        }

        $parts[] = "**SUPPORTING METADATA**:\n";
        $parts[] = 'URL: ' . $article['url'] . "\n";
        if (!empty($article['meta_description'])) {
            $parts[] = 'Meta Description: ' . $article['meta_description'] . "\n";
        }
        if (!empty($article['section_category'])) {
            $parts[] = 'Section: ' . $article['section_category'] . "\n";
        }
        $parts[] = "\n" . $divider . "\n\n";
    }

    $parts[] = "REMINDER: The article TITLE should be your PRIMARY signal (60% weight). Use the full article content to confirm and understand context (40% weight), but if there's any conflict, the title takes precedence.";

    return implode('', $parts);
}
/**
 * Build the JSON schema describing the expected API response.
 *
 * The schema is an object with one required property per article, named
 * article_1 .. article_N, each holding the required strings 'main_tag' and
 * 'subtag'. Additional properties are disallowed at both levels.
 *
 * @param int $articleCount Number of articles in the batch.
 * @return array The schema as a nested PHP array.
 */
function buildResponseSchema($articleCount) {
    // Every article uses the same per-article schema
    $articleSchema = [
        'type' => 'object',
        'properties' => [
            'main_tag' => [
                'type' => 'string',
                'description' => 'The main tag assigned to this article',
            ],
            'subtag' => [
                'type' => 'string',
                'description' => 'The subtag assigned to this article (empty string if none)',
            ],
        ],
        'required' => ['main_tag', 'subtag'],
        'additionalProperties' => false,
    ];

    $perArticle = [];
    $number = 0;
    while ($number < $articleCount) {
        $number++;
        $perArticle['article_' . $number] = $articleSchema;
    }

    $schema = ['type' => 'object'];
    $schema['properties'] = $perArticle;
    $schema['required'] = array_keys($perArticle);
    $schema['additionalProperties'] = false;

    return $schema;
}
/**
 * POST a JSON request to an LLM endpoint, retrying with exponential backoff.
 *
 * Up to MAX_RETRIES attempts are made. Transport errors, non-200 statuses and
 * undecodable 200 responses are retried after 2^attempt seconds; HTTP 429 is
 * retried after a longer 2^attempt * 5 seconds; HTTP 413 is never retried.
 *
 * @param string $endpoint     Full URL of the chat completions endpoint.
 * @param string $apiKey       Bearer token for the Authorization header.
 * @param array  $requestData  Request body, encoded as JSON before sending.
 * @param string $providerName Provider label used in log output.
 * @param string $articleUrl   Article URL used in log output only.
 * @return array|false Decoded JSON response on success, false on failure.
 */
function makeAPICall($endpoint, $apiKey, $requestData, $providerName, $articleUrl = 'unknown') {
    $headers = [
        'Authorization: Bearer ' . $apiKey,
        'Content-Type: application/json',
    ];

    // Encode once, outside the retry loop. Article text may contain malformed
    // UTF-8, which would make json_encode() fail and post an empty body, so
    // invalid sequences are substituted instead.
    $payload = json_encode($requestData, JSON_INVALID_UTF8_SUBSTITUTE);
    if ($payload === false) {
        echo " ❌ {$providerName} request could not be encoded as JSON for {$articleUrl}: " . json_last_error_msg() . "\n";
        return false;
    }

    $attempt = 0;
    while ($attempt < MAX_RETRIES) {
        $attempt++;

        $ch = curl_init($endpoint);
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 300); // Long timeout to accommodate full-length articles

        $response = curl_exec($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $error = curl_error($ch);
        curl_close($ch);

        if ($error) {
            echo " ⚠️ {$providerName} API error (attempt {$attempt}) for {$articleUrl}: {$error}\n";
        } elseif ($httpCode === 200) {
            $decoded = json_decode($response, true);
            if (is_array($decoded)) {
                return $decoded;
            }
            // A 200 whose body is not a JSON document is treated like any other failure
            echo " ⚠️ {$providerName} API returned an undecodable response (attempt {$attempt}) for {$articleUrl}\n";
        } elseif ($httpCode === 429) {
            // Rate limit - wait longer, but only when another attempt will follow
            if ($attempt < MAX_RETRIES) {
                $waitTime = pow(2, $attempt) * 5;
                echo " ⚠️ {$providerName} rate limit hit (attempt {$attempt}) for {$articleUrl}. Waiting {$waitTime}s...\n";
                sleep($waitTime);
            } else {
                echo " ⚠️ {$providerName} rate limit hit (attempt {$attempt}) for {$articleUrl}\n";
            }
            continue;
        } elseif ($httpCode === 413) {
            // Payload too large - retrying the same payload cannot succeed
            echo " ⚠️ {$providerName} payload too large (attempt {$attempt}) for {$articleUrl} - article content may be too long\n";
            echo " Try reducing MAX_ARTICLE_WORDS in the script\n";
            return false;
        } else {
            echo " ⚠️ {$providerName} API returned status {$httpCode} (attempt {$attempt}) for {$articleUrl}\n";
            $errorData = json_decode((string) $response, true);
            if ($errorData && isset($errorData['error']['message'])) {
                echo " Error: {$errorData['error']['message']}\n";
            } elseif ($response) {
                echo " Response: " . substr($response, 0, 500) . "\n";
            }
        }

        // Exponential backoff for retries
        if ($attempt < MAX_RETRIES) {
            $waitTime = pow(2, $attempt);
            echo " ⏳ Retrying in {$waitTime}s...\n";
            sleep($waitTime);
        }
    }

    echo " ❌ {$providerName} API failed after " . MAX_RETRIES . " attempts for {$articleUrl}\n";
    return false;
}
/**
 * Convert a chat completion response into one tagging result per article.
 *
 * The message content is expected to be a JSON document keyed article_1 ..
 * article_N. When the content is absent or cannot be decoded, every article
 * receives a MANUAL_REVIEW error entry; when a single article key is absent,
 * only that article does.
 *
 * @param mixed $apiResponse   Decoded API response.
 * @param int   $expectedCount Number of articles in the batch.
 * @return array List of ['tag' => string, 'subtag' => string, 'error' => bool].
 */
function parseAPIResponse($apiResponse, $expectedCount) {
    $manualReview = ['tag' => 'MANUAL_REVIEW', 'subtag' => '', 'error' => true];

    $rawContent = $apiResponse['choices'][0]['message']['content'] ?? null;
    if ($rawContent === null) {
        return array_fill(0, $expectedCount, $manualReview);
    }

    $decoded = json_decode($rawContent, true);
    if (!$decoded) {
        return array_fill(0, $expectedCount, $manualReview);
    }

    $parsed = [];
    for ($number = 1; $number <= $expectedCount; ++$number) {
        $entry = $decoded['article_' . $number] ?? null;
        if ($entry === null) {
            // The model skipped this article
            $parsed[] = $manualReview;
            continue;
        }
        $parsed[] = [
            'tag' => $entry['main_tag'] ?? 'MANUAL_REVIEW',
            'subtag' => $entry['subtag'] ?? '',
            'error' => false,
        ];
    }

    return $parsed;
}
/**
 * Merge the per-provider tagging results into one CSV row per article.
 *
 * Each row starts with post_id, url and title. Tag/subtag columns are added
 * for every enabled provider ('N/A' tag and empty subtag when that provider
 * produced no results), and a yes/no tags_match column is added only when
 * both providers are enabled.
 *
 * @param array      $batch             Articles in the batch.
 * @param array|null $chatgptResults    ChatGPT results aligned with $batch, or null.
 * @param array|null $perplexityResults Perplexity results aligned with $batch, or null.
 * @return array List of associative rows in CSV column order.
 */
function combineResults($batch, $chatgptResults, $perplexityResults) {
    // Result for one article, or the placeholder when the provider returned nothing
    $pick = static function ($results, $position) {
        return $results ? $results[$position] : ['tag' => 'N/A', 'subtag' => ''];
    };

    $rows = [];
    foreach ($batch as $position => $article) {
        // Base columns
        $row = [
            'post_id' => $article['post_id'],
            'url' => $article['url'],
            'title' => $article['title'],
        ];

        if (USE_CHATGPT) {
            $fromChatgpt = $pick($chatgptResults, $position);
            $row['chatgpt_tag'] = $fromChatgpt['tag'];
            $row['chatgpt_subtag'] = $fromChatgpt['subtag'];
        }

        if (USE_PERPLEXITY) {
            $fromPerplexity = $pick($perplexityResults, $position);
            $row['perplexity_tag'] = $fromPerplexity['tag'];
            $row['perplexity_subtag'] = $fromPerplexity['subtag'];
        }

        // Agreement is only meaningful when both providers ran
        if (USE_CHATGPT && USE_PERPLEXITY) {
            $sameTag = $fromChatgpt['tag'] === $fromPerplexity['tag'];
            $sameSubtag = $fromChatgpt['subtag'] === $fromPerplexity['subtag'];
            if ($sameTag && $sameSubtag) {
                $row['tags_match'] = 'yes';
            } else {
                $row['tags_match'] = 'no';
            }
        }

        $rows[] = $row;
    }

    return $rows;
}
/**
 * Append result rows to the output CSV, writing the header first when the
 * file does not exist yet.
 *
 * The header mirrors the row layout: the base columns, then the tag/subtag
 * columns of each enabled provider, then tags_match when both are enabled.
 *
 * @param array $results Rows to append, each already in column order.
 * @return void
 */
function appendToOutputCSV($results) {
    // Must be checked before fopen(), which creates the file in append mode
    $needsHeader = !file_exists(OUTPUT_CSV);

    $handle = fopen(OUTPUT_CSV, 'a');
    if (!$handle) {
        echo " ❌ Failed to open output file\n";
        return;
    }

    if ($needsHeader) {
        // Column list depends on which LLMs are enabled
        $columns = array_merge(
            ['post_id', 'url', 'title'],
            USE_CHATGPT ? ['chatgpt_tag', 'chatgpt_subtag'] : [],
            USE_PERPLEXITY ? ['perplexity_tag', 'perplexity_subtag'] : [],
            (USE_CHATGPT && USE_PERPLEXITY) ? ['tags_match'] : []
        );
        fputcsv($handle, $columns);
    }

    foreach ($results as $row) {
        fputcsv($handle, $row);
    }

    fclose($handle);
}
/**
 * Summarise the output CSV into per-provider tag count files.
 *
 * For each enabled provider whose tag column is present in the CSV header,
 * every row with a real tag (not empty, MANUAL_REVIEW or N/A) increments the
 * count of its main tag and, when a subtag is set, of the tag/subtag pair.
 * Counts are sorted by tag, then subtag, and written to
 * output/<provider>-tag-counts.csv. A status line is printed per provider.
 *
 * @return void
 */
function generateTagCounts() {
    if (!file_exists(OUTPUT_CSV)) {
        echo " ⚠️ No output CSV found to generate counts\n";
        return;
    }

    $handle = fopen(OUTPUT_CSV, 'r');
    if (!$handle) {
        echo " ❌ Failed to open output CSV\n";
        return;
    }

    // The header determines where each provider's columns are
    $header = fgetcsv($handle);
    if (!$header) {
        echo " ❌ Failed to read CSV header\n";
        fclose($handle);
        return;
    }
    $columnMap = array_flip($header);

    // Both providers share the same counting, sorting and writing logic
    $providers = [
        'chatgpt' => [
            'label' => 'ChatGPT',
            'enabled' => USE_CHATGPT,
            'tagIdx' => $columnMap['chatgpt_tag'] ?? null,
            'subtagIdx' => $columnMap['chatgpt_subtag'] ?? null,
            'path' => __DIR__ . '/output/chatgpt-tag-counts.csv',
            'counts' => [],
        ],
        'perplexity' => [
            'label' => 'Perplexity',
            'enabled' => USE_PERPLEXITY,
            'tagIdx' => $columnMap['perplexity_tag'] ?? null,
            'subtagIdx' => $columnMap['perplexity_subtag'] ?? null,
            'path' => __DIR__ . '/output/perplexity-tag-counts.csv',
            'counts' => [],
        ],
    ];

    // Read and count
    while (($row = fgetcsv($handle)) !== false) {
        foreach ($providers as $name => $provider) {
            if ($provider['tagIdx'] === null || !$provider['enabled']) {
                continue;
            }

            $tag = $row[$provider['tagIdx']] ?? '';
            $subtag = '';
            if ($provider['subtagIdx'] !== null) {
                $subtag = $row[$provider['subtagIdx']] ?? '';
            }

            if (empty($tag) || $tag === 'MANUAL_REVIEW' || $tag === 'N/A') {
                continue;
            }

            // Always count the main tag; count the tag/subtag pair when a subtag exists
            $subtagsToCount = [''];
            if (!empty($subtag)) {
                $subtagsToCount[] = $subtag;
            }
            foreach ($subtagsToCount as $countedSubtag) {
                $countKey = $tag . '||' . $countedSubtag;
                if (!isset($providers[$name]['counts'][$countKey])) {
                    $providers[$name]['counts'][$countKey] = ['tag' => $tag, 'subtag' => $countedSubtag, 'count' => 0];
                }
                $providers[$name]['counts'][$countKey]['count']++;
            }
        }
    }
    fclose($handle);

    // Order by tag name, then by subtag
    $byTagThenSubtag = static function ($left, $right) {
        return strcmp($left['tag'], $right['tag']) ?: strcmp($left['subtag'], $right['subtag']);
    };

    foreach ($providers as $provider) {
        $label = $provider['label'];
        $counts = $provider['counts'];
        usort($counts, $byTagThenSubtag);

        if ($provider['enabled'] && !empty($counts)) {
            $out = fopen($provider['path'], 'w');
            if ($out) {
                fputcsv($out, ['Main Tag', 'Subtag', 'Article Count']);
                foreach ($counts as $entry) {
                    fputcsv($out, [$entry['tag'], $entry['subtag'], $entry['count']]);
                }
                fclose($out);
                echo " ✅ {$label} counts saved\n";
            }
        } elseif ($provider['enabled']) {
            echo " ⚠️ No {$label} tags to count\n";
        } else {
            echo " ⏭️ {$label} disabled - no counts to generate\n";
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment