db-pj · December 1, 2025 20:32
diff --git a/assign-tags.php b/assign-tags.php
 <?php
 /**
 * Article Tag Assignment Script
 * Uses ChatGPT and Perplexity APIs to assign tags and subtags to articles
 *
 * Usage: php assign-tags.php
 */

 // =========================================
 // CONFIGURATION
 // =========================================

 // NOTE(review): OPENAI_API_KEY and PERPLEXITY_API_KEY are read by validateConfig(),
 // processWithChatGPT() and processWithPerplexity(), but no definition is visible in
 // this file. Presumably they are defined elsewhere (e.g. by the WordPress bootstrap
 // below or a removed section) - confirm before running.

 // API Models (sent as the "model" field of each request)
 const OPENAI_MODEL = 'gpt-4o';
 const PERPLEXITY_MODEL = 'sonar-pro';

 // API Pricing (per 1M tokens) - used only for the cost estimate printed at the end
 const OPENAI_INPUT_PRICE = 2.50;   // $2.50 per 1M input tokens (gpt-4o)
 const OPENAI_OUTPUT_PRICE = 10.00; // $10.00 per 1M output tokens (gpt-4o)
 const PERPLEXITY_INPUT_PRICE = 1.00;   // $1.00 per 1M input tokens (sonar-pro)
 const PERPLEXITY_OUTPUT_PRICE = 5.00;  // $5.00 per 1M output tokens (sonar-pro)

 // LLM Selection - Enable/disable which LLMs to use
 // These flags also decide which columns appear in the output CSV.
 const USE_CHATGPT = true;      // Run ChatGPT tagging
 const USE_PERPLEXITY = false;  // Run Perplexity tagging

 // Processing Configuration
 const BATCH_SIZE = 1;  // Number of articles to process per API call (reduced to 1 for full article content)
 const DEV_MODE = false;  // When true, only process the first 50 articles (see the array_slice in the main script)
 const MAX_RETRIES = 3;  // Maximum attempts per API call in makeAPICall()
 const MAX_ARTICLE_WORDS = 10000;  // Maximum words from article content (to prevent token limit issues)

 // File Paths (resolved relative to this script's directory)
 const OUTPUT_CSV = __DIR__ . '/output/article-tags.csv';
 const TAG_DEFINITIONS = __DIR__ . '/tag-definitions.json';

 // WordPress Bootstrap
 // PUBLIC_HTML must point at the directory containing wp-load.php; the script
 // aborts with exit code 1 when it is unset or the file is missing.
 $public_html = getenv('PUBLIC_HTML');
 if (!$public_html || !file_exists($public_html . '/wp-load.php')) {
    echo "❌ Error: PUBLIC_HTML environment variable not set or wp-load.php not found\n";
    exit(1);
 }
 require_once($public_html . '/wp-load.php');

 // API Endpoints
 const OPENAI_ENDPOINT = 'https://api.openai.com/v1/chat/completions';
 const PERPLEXITY_ENDPOINT = 'https://api.perplexity.ai/chat/completions';

 // =========================================
 // MAIN SCRIPT
 // =========================================
 //
 // Flow: validate config -> load tag definitions -> load articles -> remove old
 // output files -> tag the articles batch by batch (appending to the CSV after
 // every batch) -> write tag-count summaries -> print a cost estimate.

 echo "========================================\n";
 echo "Article Tag Assignment Script\n";
 echo "========================================\n\n";

 // Running token totals; the processWith*() functions update this by reference.
 $tokenUsage = [
    'chatgpt_input' => 0,
    'chatgpt_output' => 0,
    'perplexity_input' => 0,
    'perplexity_output' => 0,
 ];

 // Validate configuration
 if (!validateConfig()) {
    exit(1);
 }

 // Load tag definitions. Both sections are indexed below and in buildPrompt(),
 // so a file missing either one is treated as a load failure instead of being
 // allowed to crash later with a TypeError.
 $tagDefinitions = loadTagDefinitions();
 if (
    !is_array($tagDefinitions)
    || !isset($tagDefinitions['main_tags'], $tagDefinitions['subtags'])
    || !is_array($tagDefinitions['main_tags'])
    || !is_array($tagDefinitions['subtags'])
 ) {
    echo "❌ Failed to load tag definitions\n";
    exit(1);
 }

 echo "✅ Loaded " . count($tagDefinitions['main_tags']) . " main tags\n";
 echo "✅ Loaded subtag definitions\n\n";

 // Load articles from database
 $articles = loadArticlesFromDatabase();
 if (!$articles) {
    echo "❌ Failed to load articles from database\n";
    exit(1);
 }

 $totalArticles = count($articles);
 echo "✅ Loaded {$totalArticles} articles from database\n";

 // Apply DEV mode limit
 if (DEV_MODE) {
    $devModeLimit = 50;  // Number of articles kept when DEV_MODE is on
    $articles = array_slice($articles, 0, $devModeLimit);
    echo "🔧 DEV MODE: Limited to " . count($articles) . " articles\n";
 }

 // Number of articles this run will actually process (after the DEV mode limit).
 $articlesToProcess = count($articles);

 // Delete existing output files if they exist (fresh start)
 $outputFiles = [
    OUTPUT_CSV,
    __DIR__ . '/output/chatgpt-tag-counts.csv',
    __DIR__ . '/output/perplexity-tag-counts.csv'
 ];

 foreach ($outputFiles as $file) {
    if (file_exists($file)) {
        unlink($file);
        echo "🗑️  Deleted existing file: " . basename($file) . "\n";
    }
 }

 echo "\n📊 " . $articlesToProcess . " articles to process\n\n";

 // Process articles in batches
 $batches = array_chunk($articles, BATCH_SIZE);
 $totalBatches = count($batches);
 $processedCount = 0;

 echo "Starting batch processing ({$totalBatches} batches of " . BATCH_SIZE . " articles)...\n\n";

 foreach ($batches as $batchNum => $batch) {
    $batchIndex = $batchNum + 1;
    echo "=== Batch {$batchIndex}/{$totalBatches} ===\n";

    $chatgptResults = null;
    $perplexityResults = null;

    // Process with ChatGPT (if enabled)
    if (USE_CHATGPT) {
        echo "🤖 Calling ChatGPT API...\n";
        $chatgptResults = processWithChatGPT($batch, $tagDefinitions, $tokenUsage);
    } else {
        echo "⏭️  ChatGPT disabled - skipping\n";
    }

    // Process with Perplexity (if enabled)
    if (USE_PERPLEXITY) {
        echo "🔍 Calling Perplexity API...\n";
        $perplexityResults = processWithPerplexity($batch, $tagDefinitions, $tokenUsage);
    } else {
        echo "⏭️  Perplexity disabled - skipping\n";
    }

    // Combine results and write to CSV immediately, so finished batches are
    // already on disk if a later batch fails.
    $combinedResults = combineResults($batch, $chatgptResults, $perplexityResults);
    appendToOutputCSV($combinedResults);

    $processedCount += count($batch);
    // Fixed: this line previously interpolated an undefined $remainingCount.
    echo "✅ Batch complete ({$processedCount}/{$articlesToProcess} articles processed)\n\n";

    // Small delay between batches to avoid rate limiting
    if ($batchIndex < $totalBatches) {
        sleep(1);
    }
 }

 // Generate tag count summaries
 echo "\n📊 Generating tag count summaries...\n";
 generateTagCounts();

 // Calculate costs (prices are expressed per 1M tokens)
 $chatgptCost = ($tokenUsage['chatgpt_input'] / 1000000 * OPENAI_INPUT_PRICE) +
               ($tokenUsage['chatgpt_output'] / 1000000 * OPENAI_OUTPUT_PRICE);
 $perplexityCost = ($tokenUsage['perplexity_input'] / 1000000 * PERPLEXITY_INPUT_PRICE) +
                  ($tokenUsage['perplexity_output'] / 1000000 * PERPLEXITY_OUTPUT_PRICE);
 $totalCost = $chatgptCost + $perplexityCost;

 echo "\n========================================\n";
 echo "✅ Processing complete!\n";
 echo "========================================\n";
 echo "📄 Results: " . OUTPUT_CSV . "\n";
 if (USE_CHATGPT) {
    echo "📊 ChatGPT counts: " . __DIR__ . '/output/chatgpt-tag-counts.csv' . "\n";
 }
 if (USE_PERPLEXITY) {
    echo "📊 Perplexity counts: " . __DIR__ . '/output/perplexity-tag-counts.csv' . "\n";
 }
 echo "\n";
 echo "💰 API Cost Estimate:\n";
 if (USE_CHATGPT) {
    echo "   ChatGPT: $" . number_format($chatgptCost, 4) . " ";
    echo "(" . number_format($tokenUsage['chatgpt_input']) . " in / ";
    echo number_format($tokenUsage['chatgpt_output']) . " out tokens)\n";
 }
 if (USE_PERPLEXITY) {
    echo "   Perplexity: $" . number_format($perplexityCost, 4) . " ";
    echo "(" . number_format($tokenUsage['perplexity_input']) . " in / ";
    echo number_format($tokenUsage['perplexity_output']) . " out tokens)\n";
 }
 if (USE_CHATGPT || USE_PERPLEXITY) {
    echo "   Total: $" . number_format($totalCost, 4) . "\n";
 }
 echo "========================================\n";

 // =========================================
 // FUNCTIONS
 // =========================================

 /**
 * Validate the script configuration before any work starts.
 *
 * Checks that at least one LLM is enabled, that every enabled LLM has an API
 * key constant which is defined, non-empty and not the placeholder value, and
 * that the tag definitions file exists. Also creates the output directory when
 * it is missing. Every problem found is printed, not just the first one.
 *
 * @return bool True when the configuration is usable, false otherwise.
 */
 function validateConfig() {
    $errors = [];

    // True when the named key constant is absent, empty, or still the placeholder.
    // The defined() guard matters: reading an undefined constant throws an Error
    // on PHP 8, which would abort the script before any message is printed.
    $keyIsMissing = function ($constantName, $placeholder) {
        if (!defined($constantName)) {
            return true;
        }
        $value = constant($constantName);
        return $value === '' || $value === null || $value === $placeholder;
    };

    // Check that at least one LLM is enabled
    if (!USE_CHATGPT && !USE_PERPLEXITY) {
        $errors[] = "At least one LLM must be enabled (USE_CHATGPT or USE_PERPLEXITY)";
    }

    // Only validate API keys for enabled LLMs
    if (USE_CHATGPT && $keyIsMissing('OPENAI_API_KEY', 'your-openai-api-key-here')) {
        $errors[] = "OpenAI API key not configured (required when USE_CHATGPT is true)";
    }

    if (USE_PERPLEXITY && $keyIsMissing('PERPLEXITY_API_KEY', 'your-perplexity-api-key-here')) {
        $errors[] = "Perplexity API key not configured (required when USE_PERPLEXITY is true)";
    }

    if (!file_exists(TAG_DEFINITIONS)) {
        $errors[] = "Tag definitions file not found: " . TAG_DEFINITIONS;
    }

    // Create output directory if it doesn't exist. The trailing is_dir() covers
    // the case where the directory appeared between the check and the mkdir().
    $outputDir = dirname(OUTPUT_CSV);
    if (!is_dir($outputDir) && !mkdir($outputDir, 0755, true) && !is_dir($outputDir)) {
        $errors[] = "Unable to create output directory: " . $outputDir;
    }

    if (!empty($errors)) {
        echo "❌ Configuration errors:\n";
        foreach ($errors as $error) {
            echo "   - {$error}\n";
        }
        return false;
    }

    return true;
 }

 /**
 * Load tag definitions from the JSON file named by TAG_DEFINITIONS.
 *
 * The decoded document must be an array containing array-valued "main_tags"
 * and "subtags" entries, because callers index both sections directly.
 *
 * @return array|null The decoded definitions, or null when the file cannot be
 *                    read, is not valid JSON, or lacks either required section.
 */
 function loadTagDefinitions() {
    // Check readability first so a missing file returns null without a warning.
    if (!is_readable(TAG_DEFINITIONS)) {
        return null;
    }

    $json = file_get_contents(TAG_DEFINITIONS);
    if ($json === false) {
        return null;
    }

    $definitions = json_decode($json, true);
    if (!is_array($definitions)) {
        return null;
    }

    foreach (['main_tags', 'subtags'] as $section) {
        if (!isset($definitions[$section]) || !is_array($definitions[$section])) {
            return null;
        }
    }

    return $definitions;
 }

 /**
 * Load published posts from WordPress and flatten each into an article record.
 *
 * Posts of type "post" with status "publish" are fetched page by page (100 per
 * query, oldest first). Each record holds the keys post_id, url, title,
 * h1_if_different, section_category, meta_description, full_article_content,
 * card_issuer_mentions, card_product_mentions, offer_features, existing_tags
 * and top_keywords.
 *
 * full_article_content is the post content with shortcodes and tags stripped,
 * whitespace collapsed, and truncated to MAX_ARTICLE_WORDS words.
 *
 * @return array List of article records; empty when nothing is published.
 */
 function loadArticlesFromDatabase() {
    echo "🔍 Querying WordPress database...\n";

    $articles = [];
    $posts_per_page = 100;

    // Card issuers to match
    $card_issuers = [
        'Chase', 'American Express', 'Amex', 'Citi', 'Citibank',
        'Bank of America', 'Capital One', 'Wells Fargo', 'Discover',
        'Barclays', 'U.S. Bank', 'PNC', 'TD Bank', 'USAA',
        'Navy Federal', 'Synchrony', 'Apple', 'Goldman Sachs'
    ];

    // Offer features patterns
    $offer_features_patterns = [
        'no_annual_fee' => ['no annual fee', '$0 annual fee', 'no yearly fee', 'zero annual fee'],
        '0_intro_apr_purchases' => ['0% intro apr on purchases', '0% apr on purchases', 'intro apr purchases'],
        '0_intro_apr_bt' => ['0% intro apr on balance transfers', '0% balance transfer', 'intro apr balance transfer'],
        'welcome_bonus' => ['welcome bonus', 'sign-up bonus', 'signup bonus', 'intro bonus'],
        'cash_back' => ['cash back', 'cashback'],
        'points' => ['points', 'reward points'],
        'miles' => ['miles', 'airline miles', 'travel miles'],
        'secured' => ['secured card', 'secured credit card'],
        'student' => ['student card', 'student credit card'],
        'business' => ['business card', 'business credit card'],
        'prequal' => ['prequalify', 'pre-qualify', 'prequalification', 'check eligibility'],
        'instant_number' => ['instant card number', 'instant approval', 'use instantly']
    ];

    // Count only published posts. The query below is restricted to "publish",
    // so summing every status (drafts, trash, ...) as before produced extra
    // pages that each ran an empty query and flushed the object cache.
    $post_types = ['post'];
    $total_count = 0;
    foreach ($post_types as $post_type) {
        $post_type_count = wp_count_posts($post_type);
        $total_count += isset($post_type_count->publish) ? (int) $post_type_count->publish : 0;
    }

    $number_of_pages = (int) ceil($total_count / $posts_per_page);

    // Query posts in batches
    for ($paged = 1; $paged <= $number_of_pages; $paged++) {
        $query_args = [
            'posts_per_page' => $posts_per_page,
            'paged' => $paged,
            'order' => 'ASC',
            'orderby' => 'date',
            'post_type' => $post_types,
            'post_status' => 'publish',
            'update_post_term_cache' => true,
        ];

        $query = new WP_Query($query_args);

        while ($query->have_posts()) {
            $query->the_post();
            $post_id = get_the_ID();

            // URL
            $url = get_permalink();

            // Title
            $title = do_shortcode(get_the_title());

            // H1 (if different). get_full_review_content_data() is an optional
            // project helper, hence the function_exists() guard.
            $h1 = '';
            if (function_exists('get_full_review_content_data')) {
                $full_review_content_data = get_full_review_content_data($post_id);
                if ($full_review_content_data && $full_review_content_data['enabled'] && $full_review_content_data['id']) {
                    $h1_title = do_shortcode(get_the_title($full_review_content_data['id']));
                    if ($h1_title !== $title) {
                        $h1 = $h1_title;
                    }
                }
            }

            // Section / Category (first category returned for the post)
            $section_category = '';
            $categories = get_the_category($post_id);
            if ($categories && !is_wp_error($categories)) {
                $primary_cat = $categories[0];
                $section_category = $primary_cat->name;
            }

            // Meta Description
            $meta_description = get_post_meta($post_id, '_yoast_wpseo_metadesc', true);

            // Get raw content for parsing
            $raw_content = get_the_content(null, false, $post_id);

            // Full Article Content (cleaned for LLM, with word limit)
            $clean_content = strip_shortcodes($raw_content);
            $clean_content = wp_strip_all_tags($clean_content);
            $clean_content = preg_replace('/\s+/', ' ', $clean_content);
            $clean_content = trim($clean_content);

            // Limit to MAX_ARTICLE_WORDS to prevent token limit issues
            $words = explode(' ', $clean_content);
            if (count($words) > MAX_ARTICLE_WORDS) {
                $words = array_slice($words, 0, MAX_ARTICLE_WORDS);
                $full_article_content = implode(' ', $words) . ' [Content truncated - article exceeds ' . MAX_ARTICLE_WORDS . ' words]';
            } else {
                $full_article_content = $clean_content;
            }

            // Card Issuer Mentions (case-insensitive substring match; "Amex"
            // is reported as "American Express")
            $issuer_mentions = [];
            foreach ($card_issuers as $issuer) {
                if (stripos($raw_content, $issuer) !== false) {
                    $normalized_issuer = ($issuer === 'Amex') ? 'American Express' : $issuer;
                    if (!in_array($normalized_issuer, $issuer_mentions, true)) {
                        $issuer_mentions[] = $normalized_issuer;
                    }
                }
            }
            $card_issuer_mentions = implode(', ', $issuer_mentions);

            // Card Product Mentions: numeric ids found in offer-jump / jump_link /
            // card_field shortcodes are resolved to post titles.
            $card_mentions = [];
            if (preg_match_all('/\[(?:offer-jump|jump_link|card_field)[^\]]*id=["\']?(\d+)["\']?[^\]]*\]/i', $raw_content, $matches)) {
                foreach ($matches[1] as $card_id) {
                    $card_title = get_the_title($card_id);
                    if ($card_title && !in_array($card_title, $card_mentions, true)) {
                        $card_mentions[] = $card_title;
                    }
                }
            }
            $card_product_mentions = implode(', ', $card_mentions);

            // Offer Features: a feature is recorded once as soon as any of its
            // patterns occurs. stripos() is already case-insensitive, so the
            // content and pattern are no longer lowercased first.
            $detected_features = [];
            foreach ($offer_features_patterns as $feature_key => $patterns) {
                foreach ($patterns as $pattern) {
                    if (stripos($raw_content, $pattern) !== false) {
                        $detected_features[] = $feature_key;
                        break;
                    }
                }
            }
            $offer_features = implode(', ', $detected_features);

            // Existing Tags
            $existing_tags = '';
            $tags = get_the_tags($post_id);
            if ($tags && !is_wp_error($tags)) {
                $tag_names = [];
                foreach ($tags as $tag) {
                    $tag_names[] = $tag->name;
                }
                $existing_tags = implode(', ', $tag_names);
            }

            // Top Keywords
            $top_keywords = get_post_meta($post_id, '_yoast_wpseo_focuskw', true);

            $articles[] = [
                'post_id' => $post_id,
                'url' => $url,
                'title' => $title,
                'h1_if_different' => $h1,
                'section_category' => $section_category,
                'meta_description' => $meta_description,
                'full_article_content' => $full_article_content,
                'card_issuer_mentions' => $card_issuer_mentions,
                'card_product_mentions' => $card_product_mentions,
                'offer_features' => $offer_features,
                'existing_tags' => $existing_tags,
                'top_keywords' => $top_keywords,
            ];
        }

        // Reset global post state and flush the object cache after each page.
        wp_reset_postdata();
        wp_reset_query();
        wp_cache_flush();
    }

    return $articles;
 }

 /**
 * Tag one batch of articles through the OpenAI chat completions endpoint.
 *
 * Sends the prompt from buildPrompt() together with a strict JSON schema from
 * buildResponseSchema(), adds the reported prompt/completion token counts to
 * $tokenUsage, and returns one result per article in batch order.
 *
 * @param array $batch          Article records to tag.
 * @param array $tagDefinitions Decoded tag definitions.
 * @param array $tokenUsage     Running token totals (updated by reference).
 * @return array One ['tag', 'subtag', 'error'] entry per article; every entry
 *               is a MANUAL_REVIEW placeholder when the API call fails.
 */
 function processWithChatGPT($batch, $tagDefinitions, &$tokenUsage) {
    $userPrompt = buildPrompt($batch, $tagDefinitions);
    $responseSchema = buildResponseSchema(count($batch));

    $systemMessage = [
        'role' => 'system',
        'content' => 'You are an expert content taxonomist specializing in credit card and personal finance content. Your task is to accurately categorize articles based on their content and assign appropriate tags.'
    ];
    $userMessage = [
        'role' => 'user',
        'content' => $userPrompt
    ];

    $payload = [
        'model' => OPENAI_MODEL,
        'messages' => [$systemMessage, $userMessage],
        'response_format' => [
            'type' => 'json_schema',
            'json_schema' => [
                'name' => 'article_tagging_response',
                'strict' => true,
                'schema' => $responseSchema
            ]
        ],
        'temperature' => 0.3,
        // Only tags come back, so a small completion budget is enough.
        'max_completion_tokens' => 1000
    ];

    // The first article's URL identifies the batch in error logs.
    $logUrl = $batch[0]['url'] ?? 'unknown';
    $response = makeAPICall(OPENAI_ENDPOINT, OPENAI_API_KEY, $payload, 'ChatGPT', $logUrl);

    if ($response !== false) {
        // Record token usage when the response reports it.
        if (isset($response['usage'])) {
            $tokenUsage['chatgpt_input'] += $response['usage']['prompt_tokens'] ?? 0;
            $tokenUsage['chatgpt_output'] += $response['usage']['completion_tokens'] ?? 0;
        }

        return parseAPIResponse($response, count($batch));
    }

    // Call failed: flag every article in the batch for manual review.
    $manualReview = [
        'tag' => 'MANUAL_REVIEW',
        'subtag' => '',
        'error' => true
    ];

    return array_fill(0, count($batch), $manualReview);
 }

 /**
 * Tag one batch of articles through the Perplexity chat completions endpoint.
 *
 * Sends the prompt from buildPrompt() together with a JSON schema from
 * buildResponseSchema(), adds the reported prompt/completion token counts to
 * $tokenUsage, and returns one result per article in batch order.
 *
 * @param array $batch          Article records to tag.
 * @param array $tagDefinitions Decoded tag definitions.
 * @param array $tokenUsage     Running token totals (updated by reference).
 * @return array One ['tag', 'subtag', 'error'] entry per article; every entry
 *               is a MANUAL_REVIEW placeholder when the API call fails.
 */
 function processWithPerplexity($batch, $tagDefinitions, &$tokenUsage) {
    $userPrompt = buildPrompt($batch, $tagDefinitions);
    $responseSchema = buildResponseSchema(count($batch));

    $systemMessage = [
        'role' => 'system',
        'content' => 'You are an expert content taxonomist specializing in credit card and personal finance content. Your task is to accurately categorize articles based on their content and assign appropriate tags.'
    ];
    $userMessage = [
        'role' => 'user',
        'content' => $userPrompt
    ];

    $payload = [
        'model' => PERPLEXITY_MODEL,
        'messages' => [$systemMessage, $userMessage],
        'response_format' => [
            'type' => 'json_schema',
            'json_schema' => [
                'strict' => true,
                'schema' => $responseSchema
            ]
        ],
        'temperature' => 0.3,
        // Only tags come back, so a small completion budget is enough.
        'max_tokens' => 1000
    ];

    // The first article's URL identifies the batch in error logs.
    $logUrl = $batch[0]['url'] ?? 'unknown';
    $response = makeAPICall(PERPLEXITY_ENDPOINT, PERPLEXITY_API_KEY, $payload, 'Perplexity', $logUrl);

    if ($response !== false) {
        // Record token usage when the response reports it.
        if (isset($response['usage'])) {
            $tokenUsage['perplexity_input'] += $response['usage']['prompt_tokens'] ?? 0;
            $tokenUsage['perplexity_output'] += $response['usage']['completion_tokens'] ?? 0;
        }

        return parseAPIResponse($response, count($batch));
    }

    // Call failed: flag every article in the batch for manual review.
    $manualReview = [
        'tag' => 'MANUAL_REVIEW',
        'subtag' => '',
        'error' => true
    ];

    return array_fill(0, count($batch), $manualReview);
 }

 /**
 * Assemble the user prompt sent to the LLM for one batch.
 *
 * The prompt consists of, in order: an instruction line, the main tag
 * definitions, the subtag definitions grouped by main tag, the tagging rules,
 * one section per article (title, full content, URL and optional meta
 * description / section), and a closing reminder about title weighting.
 *
 * @param array $batch          Article records (title, full_article_content,
 *                              url, meta_description, section_category).
 * @param array $tagDefinitions Decoded definitions with "main_tags" and "subtags".
 * @return string The complete prompt text.
 */
 function buildPrompt($batch, $tagDefinitions) {
    $divider = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━";
    $parts = [];

    $parts[] = "Please analyze the following articles and assign appropriate tags based on the tag definitions provided.\n\n";

    // Main tag definitions
    $parts[] = "=== MAIN TAG DEFINITIONS ===\n\n";
    foreach ($tagDefinitions['main_tags'] as $mainTagName => $mainTagInfo) {
        $parts[] = "**" . $mainTagName . "**\n";
        $parts[] = $mainTagInfo['definition'] . "\n\n";
    }

    // Subtag definitions, grouped under their main tag
    $parts[] = "\n=== SUBTAG DEFINITIONS ===\n\n";
    foreach ($tagDefinitions['subtags'] as $parentTag => $children) {
        $parts[] = "**" . $parentTag . "** subtags:\n";
        foreach ($children as $childName => $childInfo) {
            $parts[] = "  - " . $childName . ": " . $childInfo['definition'] . "\n";
        }
        $parts[] = "\n";
    }

    // Fixed rule text
    $parts[] = "\n=== TAGGING RULES ===\n\n";
    $ruleLines = [
        "1. Each article MUST be assigned exactly ONE main tag\n",
        "2. Each article MAY be assigned ONE subtag (or none) based on relevance and the article's main tag. For example, an article about a credit card that offers everyday purchases cash back may get the subtag 'Everyday Purchases' if the main tag is 'Cash Back Credit Cards'.\n",
        "3. **CRITICAL CONSTRAINT - SUBTAG VALIDATION**: \n",
        "   - Subtags can ONLY be assigned if they belong to the article's assigned main tag\n",
        "   - NEVER assign a subtag from a different main tag category\n",
        "   - Before assigning a subtag, verify: Does this subtag appear in the subtag list for my chosen main tag?\n",
        "   - If the answer is NO, do not assign that subtag. Leave it empty instead.\n",
        "   - Example: 'Everyday Purchases' is ONLY valid for 'Cash Back Credit Cards', not for any other main tag\n",
        "4. **CRITICAL WEIGHTING**: The article TITLE should carry significantly more weight than the article content when determining tag relevance\n",
        "   - Title = 60% importance (1.5x the weight of content)\n",
        "   - Full Article Content = 40% importance\n",
        "5. First, determine what the title indicates the article is about\n",
        "6. Then, read the full article content to confirm and refine your understanding\n",
        "7. If the title and content suggest different tags, prioritize the title\n",
        "8. Use meta description and section category as minor supporting signals only\n\n",
        "**REMINDER**: Before finalizing your response, verify that any assigned subtag belongs to the article's assigned main tag. Cross-check the subtag definitions above.\n\n",
    ];
    $parts[] = implode('', $ruleLines);

    // One section per article; numbering is 1-based to match the response schema.
    $parts[] = "=== ARTICLES TO TAG ===\n\n";
    foreach ($batch as $position => $article) {
        $parts[] = "Article #" . ($position + 1) . ":\n";
        $parts[] = $divider . "\n";
        $parts[] = "**TITLE (60% WEIGHT)**: " . $article['title'] . "\n";
        $parts[] = $divider . "\n\n";

        $parts[] = "**FULL ARTICLE CONTENT (40% WEIGHT)**:\n";
        if (!empty($article['full_article_content'])) {
            $parts[] = $article['full_article_content'] . "\n\n";
        }

        $parts[] = "**SUPPORTING METADATA**:\n";
        $parts[] = "URL: " . $article['url'] . "\n";
        if (!empty($article['meta_description'])) {
            $parts[] = "Meta Description: " . $article['meta_description'] . "\n";
        }
        if (!empty($article['section_category'])) {
            $parts[] = "Section: " . $article['section_category'] . "\n";
        }

        $parts[] = "\n" . $divider . "\n\n";
    }

    $parts[] = "REMINDER: The article TITLE should be your PRIMARY signal (60% weight). Use the full article content to confirm and understand context (40% weight), but if there's any conflict, the title takes precedence.";

    return implode('', $parts);
 }

 /**
 * Build the JSON schema the LLM response must follow.
 *
 * The schema is an object with one required property per article, named
 * "article_1" .. "article_N". Each property is itself an object with two
 * required string fields, "main_tag" and "subtag", and no other properties.
 *
 * @param int $articleCount Number of articles in the batch.
 * @return array The schema as a nested PHP array.
 */
 function buildResponseSchema($articleCount) {
    // Every article slot has the same shape, so define it once.
    $articleShape = [
        'type' => 'object',
        'properties' => [
            'main_tag' => [
                'type' => 'string',
                'description' => 'The main tag assigned to this article'
            ],
            'subtag' => [
                'type' => 'string',
                'description' => 'The subtag assigned to this article (empty string if none)'
            ]
        ],
        'required' => ['main_tag', 'subtag'],
        'additionalProperties' => false
    ];

    $slots = [];
    $slotNumber = 0;
    while (++$slotNumber <= $articleCount) {
        $slots['article_' . $slotNumber] = $articleShape;
    }

    $schema = [];
    $schema['type'] = 'object';
    $schema['properties'] = $slots;
    $schema['required'] = array_keys($slots);
    $schema['additionalProperties'] = false;

    return $schema;
 }

 /**
 * POST a JSON request to an LLM endpoint, retrying on failure.
 *
 * Up to MAX_RETRIES attempts are made. Transport errors, non-200 statuses and
 * 200 responses whose body is not a JSON object/array are retried after
 * 2^attempt seconds; HTTP 429 waits 2^attempt * 5 seconds instead. HTTP 413 is
 * never retried. No wait happens after the final attempt.
 *
 * @param string $endpoint     Full URL of the API endpoint.
 * @param string $apiKey       Bearer token for the Authorization header.
 * @param array  $requestData  Request body; JSON-encoded before sending.
 * @param string $providerName Provider label used in log messages.
 * @param string $articleUrl   Article URL used in log messages.
 * @return array|false The decoded response on success, false on failure.
 */
 function makeAPICall($endpoint, $apiKey, $requestData, $providerName, $articleUrl = 'unknown') {
    // Encode once: the payload is the same for every attempt. json_encode()
    // returns false for unencodable input (e.g. invalid UTF-8 in the article
    // text); retrying cannot fix that, so fail before touching the network.
    $payload = json_encode($requestData);
    if ($payload === false) {
        echo "   ❌ {$providerName} request could not be encoded as JSON for {$articleUrl}: " . json_last_error_msg() . "\n";
        return false;
    }

    $headers = [
        'Authorization: Bearer ' . $apiKey,
        'Content-Type: application/json'
    ];

    $attempt = 0;

    while ($attempt < MAX_RETRIES) {
        $attempt++;

        $ch = curl_init($endpoint);
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 300);  // Increased from 120 to 300 seconds for longer articles

        $response = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $error = curl_error($ch);
        curl_close($ch);

        if ($error) {
            echo "   ⚠️  {$providerName} API error (attempt {$attempt}) for {$articleUrl}: {$error}\n";
        } elseif ($httpCode === 200) {
            $decoded = json_decode($response, true);
            if (is_array($decoded)) {
                return $decoded;
            }
            // A 200 with an undecodable body is treated like any other failed
            // attempt rather than being handed to the caller as null.
            echo "   ⚠️  {$providerName} returned an invalid JSON body (attempt {$attempt}) for {$articleUrl}\n";
        } elseif ($httpCode === 429) {
            // Rate limit - wait longer, but not when no attempts remain.
            if ($attempt >= MAX_RETRIES) {
                echo "   ⚠️  {$providerName} rate limit hit (attempt {$attempt}) for {$articleUrl}.\n";
                break;
            }
            $waitTime = pow(2, $attempt) * 5;
            echo "   ⚠️  {$providerName} rate limit hit (attempt {$attempt}) for {$articleUrl}. Waiting {$waitTime}s...\n";
            sleep($waitTime);
            continue;
        } elseif ($httpCode === 413) {
            // Payload too large
            echo "   ⚠️  {$providerName} payload too large (attempt {$attempt}) for {$articleUrl} - article content may be too long\n";
            echo "      Try reducing MAX_ARTICLE_WORDS in the script\n";
            return false;  // Don't retry payload too large errors
        } else {
            echo "   ⚠️  {$providerName} API returned status {$httpCode} (attempt {$attempt}) for {$articleUrl}\n";
            $errorData = json_decode($response, true);
            if ($errorData && isset($errorData['error']['message'])) {
                echo "      Error: {$errorData['error']['message']}\n";
            } elseif ($response) {
                echo "      Response: " . substr($response, 0, 500) . "\n";
            }
        }

        // Exponential backoff for retries
        if ($attempt < MAX_RETRIES) {
            $waitTime = pow(2, $attempt);
            echo "   ⏳ Retrying in {$waitTime}s...\n";
            sleep($waitTime);
        }
    }

    echo "   ❌ {$providerName} API failed after " . MAX_RETRIES . " attempts for {$articleUrl}\n";
    return false;
 }

 /**
 * Convert a decoded chat-completion response into per-article tag results.
 *
 * The message content of the first choice is expected to be a JSON document
 * keyed "article_1" .. "article_N". Any article whose key is missing gets a
 * MANUAL_REVIEW placeholder; if the content is absent or does not decode to a
 * non-empty value, every article gets the placeholder.
 *
 * @param mixed $apiResponse   Decoded API response.
 * @param int   $expectedCount Number of articles in the batch.
 * @return array List of ['tag' => string, 'subtag' => string, 'error' => bool].
 */
 function parseAPIResponse($apiResponse, $expectedCount) {
    $needsReview = [
        'tag' => 'MANUAL_REVIEW',
        'subtag' => '',
        'error' => true
    ];

    $rawContent = $apiResponse['choices'][0]['message']['content'] ?? null;
    $decoded = ($rawContent === null) ? null : json_decode($rawContent, true);

    // Missing content and undecodable/empty content are handled the same way.
    if (!$decoded) {
        return array_fill(0, $expectedCount, $needsReview);
    }

    $parsed = [];
    for ($slot = 1; $slot <= $expectedCount; ++$slot) {
        $entry = $decoded['article_' . $slot] ?? null;

        if ($entry === null) {
            $parsed[] = $needsReview;
            continue;
        }

        $parsed[] = [
            'tag' => $entry['main_tag'] ?? 'MANUAL_REVIEW',
            'subtag' => $entry['subtag'] ?? '',
            'error' => false
        ];
    }

    return $parsed;
 }

 /**
 * Merge per-provider tagging results into one CSV row per article.
 *
 * Every row starts with post_id, url and title. ChatGPT and Perplexity columns
 * are added only for enabled providers; a provider that is enabled but has no
 * results contributes the tag "N/A" with an empty subtag. When both providers
 * are enabled a tags_match column ("yes"/"no") records whether tag and subtag
 * agree.
 *
 * @param array      $batch             Article records in batch order.
 * @param array|null $chatgptResults    ChatGPT results, aligned with $batch.
 * @param array|null $perplexityResults Perplexity results, aligned with $batch.
 * @return array List of associative rows ready for the output CSV.
 */
 function combineResults($batch, $chatgptResults, $perplexityResults) {
    $notAvailable = ['tag' => 'N/A', 'subtag' => ''];
    $rows = [];

    foreach ($batch as $position => $article) {
        // Base columns shared by every configuration
        $row = [
            'post_id' => $article['post_id'],
            'url' => $article['url'],
            'title' => $article['title']
        ];

        $fromChatgpt = null;
        $fromPerplexity = null;

        if (USE_CHATGPT) {
            $fromChatgpt = $chatgptResults ? $chatgptResults[$position] : $notAvailable;
            $row['chatgpt_tag'] = $fromChatgpt['tag'];
            $row['chatgpt_subtag'] = $fromChatgpt['subtag'];
        }

        if (USE_PERPLEXITY) {
            $fromPerplexity = $perplexityResults ? $perplexityResults[$position] : $notAvailable;
            $row['perplexity_tag'] = $fromPerplexity['tag'];
            $row['perplexity_subtag'] = $fromPerplexity['subtag'];
        }

        // Agreement is only meaningful when both providers ran.
        if (USE_CHATGPT && USE_PERPLEXITY) {
            $sameTag = $fromChatgpt['tag'] === $fromPerplexity['tag'];
            $sameSubtag = $fromChatgpt['subtag'] === $fromPerplexity['subtag'];
            $row['tags_match'] = ($sameTag && $sameSubtag) ? 'yes' : 'no';
        }

        $rows[] = $row;
    }

    return $rows;
 }

 /**
 * Append result rows to OUTPUT_CSV, writing the header first for a new file.
 *
 * The header always starts with post_id, url, title; provider columns and the
 * tags_match column are added according to USE_CHATGPT / USE_PERPLEXITY.
 * Prints an error and returns without writing when the file cannot be opened.
 *
 * @param array $results Rows to append, one array of column values per row.
 * @return void
 */
 function appendToOutputCSV($results) {
    // Must be checked before fopen(), which creates the file in append mode.
    $isNewFile = !file_exists(OUTPUT_CSV);

    $handle = fopen(OUTPUT_CSV, 'a');
    if (!$handle) {
        echo "   ❌ Failed to open output file\n";
        return;
    }

    if ($isNewFile) {
        // Header columns depend on which LLMs are enabled
        $columns = ['post_id', 'url', 'title'];

        if (USE_CHATGPT) {
            array_push($columns, 'chatgpt_tag', 'chatgpt_subtag');
        }

        if (USE_PERPLEXITY) {
            array_push($columns, 'perplexity_tag', 'perplexity_subtag');
        }

        // tags_match only exists when both LLMs are enabled
        if (USE_CHATGPT && USE_PERPLEXITY) {
            array_push($columns, 'tags_match');
        }

        fputcsv($handle, $columns);
    }

    foreach ($results as $row) {
        fputcsv($handle, $row);
    }

    fclose($handle);
 }

 /**
 * Summarise OUTPUT_CSV into per-provider tag count files.
 *
 * For each enabled provider whose tag column exists in the CSV header, counts
 * how many rows carry each main tag and each (main tag, subtag) pair, skipping
 * empty, MANUAL_REVIEW and N/A tags. Counts are sorted by tag then subtag and
 * written to output/chatgpt-tag-counts.csv / output/perplexity-tag-counts.csv
 * next to this script. A status line is printed for each provider.
 *
 * @return void
 */
 function generateTagCounts() {
    if (!file_exists(OUTPUT_CSV)) {
        echo "   ⚠️  No output CSV found to generate counts\n";
        return;
    }

    $reader = fopen(OUTPUT_CSV, 'r');
    if (!$reader) {
        echo "   ❌ Failed to open output CSV\n";
        return;
    }

    // The header row tells us where each provider's columns live.
    $header = fgetcsv($reader);
    if (!$header) {
        echo "   ❌ Failed to read CSV header\n";
        fclose($reader);
        return;
    }

    $columnMap = array_flip($header);

    // Per-provider settings and running counts, keyed by display label.
    $providers = [
        'ChatGPT' => [
            'enabled' => USE_CHATGPT,
            'tagIdx' => $columnMap['chatgpt_tag'] ?? null,
            'subtagIdx' => $columnMap['chatgpt_subtag'] ?? null,
            'file' => __DIR__ . '/output/chatgpt-tag-counts.csv',
            'counts' => [],
        ],
        'Perplexity' => [
            'enabled' => USE_PERPLEXITY,
            'tagIdx' => $columnMap['perplexity_tag'] ?? null,
            'subtagIdx' => $columnMap['perplexity_subtag'] ?? null,
            'file' => __DIR__ . '/output/perplexity-tag-counts.csv',
            'counts' => [],
        ],
    ];

    // Increment the counter for one (tag, subtag) pair, creating it on first use.
    $bump = function (&$counts, $tag, $subtag) {
        $key = $tag . '||' . $subtag;
        if (!isset($counts[$key])) {
            $counts[$key] = ['tag' => $tag, 'subtag' => $subtag, 'count' => 0];
        }
        $counts[$key]['count']++;
    };

    while (($row = fgetcsv($reader)) !== false) {
        foreach (array_keys($providers) as $label) {
            $tagIdx = $providers[$label]['tagIdx'];
            if ($tagIdx === null || !$providers[$label]['enabled']) {
                continue;
            }

            $tag = $row[$tagIdx] ?? '';
            if (empty($tag) || $tag === 'MANUAL_REVIEW' || $tag === 'N/A') {
                continue;
            }

            $subtagIdx = $providers[$label]['subtagIdx'];
            $subtag = ($subtagIdx !== null) ? ($row[$subtagIdx] ?? '') : '';

            // Every tagged row counts toward its main tag ...
            $bump($providers[$label]['counts'], $tag, '');

            // ... and additionally toward its subtag when one is set.
            if (!empty($subtag)) {
                $bump($providers[$label]['counts'], $tag, $subtag);
            }
        }
    }

    fclose($reader);

    // Order by tag name, then by subtag
    $byTagThenSubtag = function ($left, $right) {
        if ($left['tag'] !== $right['tag']) {
            return strcmp($left['tag'], $right['tag']);
        }
        return strcmp($left['subtag'], $right['subtag']);
    };

    foreach ($providers as $label => $provider) {
        $counts = $provider['counts'];
        usort($counts, $byTagThenSubtag);

        if (!$provider['enabled']) {
            echo "   ⏭️  {$label} disabled - no counts to generate\n";
            continue;
        }

        if (empty($counts)) {
            echo "   ⚠️  No {$label} tags to count\n";
            continue;
        }

        $writer = fopen($provider['file'], 'w');
        if ($writer) {
            fputcsv($writer, ['Main Tag', 'Subtag', 'Article Count']);
            foreach ($counts as $entry) {
                fputcsv($writer, [$entry['tag'], $entry['subtag'], $entry['count']]);
            }
            fclose($writer);
            echo "   ✅ {$label} counts saved\n";
        }
    }
 }
No results found