Skip to content

Instantly share code, notes, and snippets.

@db-pj
Created December 1, 2025 20:32
Show Gist options
  • Select an option

  • Save db-pj/3b7616896f853045cb2a6ec8bb23dbd8 to your computer and use it in GitHub Desktop.

Select an option

Save db-pj/3b7616896f853045cb2a6ec8bb23dbd8 to your computer and use it in GitHub Desktop.
<?php
/**
* Article Tag Assignment Script
* Uses ChatGPT and Perplexity APIs to assign tags and subtags to articles
*
* Usage: PUBLIC_HTML=/path/to/wordpress php assign-tags.php
*
* The PUBLIC_HTML environment variable must point at the directory that
* contains wp-load.php; the script exits with status 1 otherwise.
*
* NOTE(review): OPENAI_API_KEY and PERPLEXITY_API_KEY are referenced by this
* script but are not defined anywhere in it - presumably they come from the
* WordPress bootstrap (e.g. wp-config.php). Confirm before running.
*/
// =========================================
// CONFIGURATION
// =========================================
// API Models
const OPENAI_MODEL = 'gpt-4o';
const PERPLEXITY_MODEL = 'sonar-pro';
// API Pricing (USD per 1M tokens) - used only for the cost estimate printed at the end of the run
const OPENAI_INPUT_PRICE = 2.50; // $2.50 per 1M input tokens (gpt-4o)
const OPENAI_OUTPUT_PRICE = 10.00; // $10.00 per 1M output tokens (gpt-4o)
const PERPLEXITY_INPUT_PRICE = 1.00; // $1.00 per 1M input tokens (sonar-pro)
const PERPLEXITY_OUTPUT_PRICE = 5.00; // $5.00 per 1M output tokens (sonar-pro)
// LLM Selection - Enable/disable which LLMs to use (at least one must be true, see validateConfig())
const USE_CHATGPT = true; // Run ChatGPT tagging
const USE_PERPLEXITY = false; // Run Perplexity tagging
// Processing Configuration
const BATCH_SIZE = 1; // Number of articles to process per API call (reduced to 1 for full article content)
const DEV_MODE = false; // When true, only the first 50 articles are processed (the main script slices to 50)
const MAX_RETRIES = 3; // Maximum attempts per API call in makeAPICall()
const MAX_ARTICLE_WORDS = 10000; // Maximum words from article content (to prevent token limit issues)
// File Paths (both resolved relative to this script's directory)
const OUTPUT_CSV = __DIR__ . '/output/article-tags.csv';
const TAG_DEFINITIONS = __DIR__ . '/tag-definitions.json';
// WordPress Bootstrap
// Loads the full WordPress environment so WP_Query and the template functions are available below.
$public_html = getenv('PUBLIC_HTML');
if (!$public_html || !file_exists($public_html . '/wp-load.php')) {
echo "❌ Error: PUBLIC_HTML environment variable not set or wp-load.php not found\n";
exit(1);
}
require_once($public_html . '/wp-load.php');
// API Endpoints
const OPENAI_ENDPOINT = 'https://api.openai.com/v1/chat/completions';
const PERPLEXITY_ENDPOINT = 'https://api.perplexity.ai/chat/completions';
// =========================================
// MAIN SCRIPT
// =========================================
echo "========================================\n"
    . "Article Tag Assignment Script\n"
    . "========================================\n\n";

// Running token totals per provider; the process* functions add to these by reference.
$tokenUsage = array_fill_keys(
    ['chatgpt_input', 'chatgpt_output', 'perplexity_input', 'perplexity_output'],
    0
);

// Stop immediately on configuration problems (validateConfig() prints the details).
if (!validateConfig()) {
    exit(1);
}

// Tag taxonomy that is embedded into every prompt.
$tagDefinitions = loadTagDefinitions();
if (empty($tagDefinitions)) {
    echo "❌ Failed to load tag definitions\n";
    exit(1);
}
printf("✅ Loaded %d main tags\n", count($tagDefinitions['main_tags']));
echo "✅ Loaded subtag definitions\n\n";

// Pull every published article out of WordPress.
$articles = loadArticlesFromDatabase();
if (empty($articles)) {
    echo "❌ Failed to load articles from database\n";
    exit(1);
}
$totalArticles = count($articles);
echo "✅ Loaded " . $totalArticles . " articles from database\n";

// In DEV mode only a leading slice of the articles is processed.
if (DEV_MODE) {
    $articles = array_slice($articles, 0, 50);
    printf("🔧 DEV MODE: Limited to %d articles\n", count($articles));
}

// Fresh start: remove output left behind by a previous run.
$outputFiles = [
    OUTPUT_CSV,
    __DIR__ . '/output/chatgpt-tag-counts.csv',
    __DIR__ . '/output/perplexity-tag-counts.csv',
];
foreach ($outputFiles as $stalePath) {
    if (!file_exists($stalePath)) {
        continue;
    }
    unlink($stalePath);
    echo "🗑️ Deleted existing file: " . basename($stalePath) . "\n";
}

echo "\n📊 " . count($articles) . " articles to process\n\n";
// Process articles in batches
$batches = array_chunk($articles, BATCH_SIZE);
$totalBatches = count($batches);
$processedCount = 0;
// Denominator for the progress line. This is the number of articles actually
// being processed (after any DEV_MODE slice), not the number loaded.
$articlesToProcess = count($articles);

echo "Starting batch processing ({$totalBatches} batches of " . BATCH_SIZE . " articles)...\n\n";

foreach ($batches as $batchNum => $batch) {
    $batchIndex = $batchNum + 1;
    echo "=== Batch {$batchIndex}/{$totalBatches} ===\n";

    $chatgptResults = null;
    $perplexityResults = null;

    // Process with ChatGPT (if enabled)
    if (USE_CHATGPT) {
        echo "🤖 Calling ChatGPT API...\n";
        $chatgptResults = processWithChatGPT($batch, $tagDefinitions, $tokenUsage);
    } else {
        echo "⏭️ ChatGPT disabled - skipping\n";
    }

    // Process with Perplexity (if enabled)
    if (USE_PERPLEXITY) {
        echo "🔍 Calling Perplexity API...\n";
        $perplexityResults = processWithPerplexity($batch, $tagDefinitions, $tokenUsage);
    } else {
        echo "⏭️ Perplexity disabled - skipping\n";
    }

    // Combine results and write to CSV immediately so an interrupted run keeps
    // everything processed so far.
    $combinedResults = combineResults($batch, $chatgptResults, $perplexityResults);
    appendToOutputCSV($combinedResults);

    $processedCount += count($batch);
    // Fixed: this line previously interpolated $remainingCount, which was never defined.
    echo "✅ Batch complete ({$processedCount}/{$articlesToProcess} articles processed)\n\n";

    // Small delay between batches to avoid rate limiting (skipped after the last batch)
    if ($batchIndex < $totalBatches) {
        sleep(1);
    }
}
// Generate tag count summaries
echo "\n📊 Generating tag count summaries...\n";
generateTagCounts();

// Estimated spend per provider: token totals scaled to millions times the per-1M prices.
$chatgptCost = ($tokenUsage['chatgpt_input'] / 1000000 * OPENAI_INPUT_PRICE)
    + ($tokenUsage['chatgpt_output'] / 1000000 * OPENAI_OUTPUT_PRICE);
$perplexityCost = ($tokenUsage['perplexity_input'] / 1000000 * PERPLEXITY_INPUT_PRICE)
    + ($tokenUsage['perplexity_output'] / 1000000 * PERPLEXITY_OUTPUT_PRICE);
$totalCost = $chatgptCost + $perplexityCost;

// Where the output files were written.
echo "\n========================================\n"
    . "✅ Processing complete!\n"
    . "========================================\n"
    . "📄 Results: " . OUTPUT_CSV . "\n";
if (USE_CHATGPT) {
    echo "📊 ChatGPT counts: " . __DIR__ . '/output/chatgpt-tag-counts.csv' . "\n";
}
if (USE_PERPLEXITY) {
    echo "📊 Perplexity counts: " . __DIR__ . '/output/perplexity-tag-counts.csv' . "\n";
}

// Cost report, one line per enabled provider followed by the grand total.
echo "\n" . "💰 API Cost Estimate:\n";
if (USE_CHATGPT) {
    echo " ChatGPT: $" . number_format($chatgptCost, 4) . " "
        . "(" . number_format($tokenUsage['chatgpt_input']) . " in / "
        . number_format($tokenUsage['chatgpt_output']) . " out tokens)\n";
}
if (USE_PERPLEXITY) {
    echo " Perplexity: $" . number_format($perplexityCost, 4) . " "
        . "(" . number_format($tokenUsage['perplexity_input']) . " in / "
        . number_format($tokenUsage['perplexity_output']) . " out tokens)\n";
}
if (USE_CHATGPT || USE_PERPLEXITY) {
    echo " Total: $" . number_format($totalCost, 4) . "\n";
}
echo "========================================\n";
// =========================================
// FUNCTIONS
// =========================================
/**
 * Validate configuration before any API call is made.
 *
 * Checks that at least one LLM is enabled, that the API key constant for each
 * enabled LLM is defined, non-empty and not the placeholder value, that the
 * tag definitions file exists, and that the output directory exists or can be
 * created. All problems found are printed together.
 *
 * @return bool true when the configuration is usable, false otherwise
 */
function validateConfig() {
    $errors = [];

    // A key counts as missing when its constant is undefined, blank, or still
    // the placeholder. Checking defined() first avoids a fatal "Undefined
    // constant" error on PHP 8 when the key was never declared.
    $isKeyMissing = function ($constantName, $placeholder) {
        if (!defined($constantName)) {
            return true;
        }
        $value = constant($constantName);
        return !is_string($value) || trim($value) === '' || $value === $placeholder;
    };

    // Check that at least one LLM is enabled
    if (!USE_CHATGPT && !USE_PERPLEXITY) {
        $errors[] = "At least one LLM must be enabled (USE_CHATGPT or USE_PERPLEXITY)";
    }

    // Only validate API keys for enabled LLMs
    if (USE_CHATGPT && $isKeyMissing('OPENAI_API_KEY', 'your-openai-api-key-here')) {
        $errors[] = "OpenAI API key not configured (required when USE_CHATGPT is true)";
    }
    if (USE_PERPLEXITY && $isKeyMissing('PERPLEXITY_API_KEY', 'your-perplexity-api-key-here')) {
        $errors[] = "Perplexity API key not configured (required when USE_PERPLEXITY is true)";
    }

    if (!file_exists(TAG_DEFINITIONS)) {
        $errors[] = "Tag definitions file not found: " . TAG_DEFINITIONS;
    }

    // Create output directory if it doesn't exist. The second is_dir() covers
    // the case where another process created it between the check and mkdir().
    $outputDir = dirname(OUTPUT_CSV);
    if (!is_dir($outputDir) && !mkdir($outputDir, 0755, true) && !is_dir($outputDir)) {
        $errors[] = "Could not create output directory: " . $outputDir;
    }

    if (!empty($errors)) {
        echo "❌ Configuration errors:\n";
        foreach ($errors as $error) {
            echo " - {$error}\n";
        }
        return false;
    }

    return true;
}
/**
 * Load tag definitions from the JSON file named by TAG_DEFINITIONS.
 *
 * The decoded document must be an object whose 'main_tags' and 'subtags'
 * entries are both arrays, because the main script counts 'main_tags' and
 * buildPrompt() iterates both. Any failure prints a short reason and returns
 * null, which callers already treat as "failed to load".
 *
 * @return array|null decoded definitions, or null on failure
 */
function loadTagDefinitions() {
    if (!is_readable(TAG_DEFINITIONS)) {
        echo "❌ Tag definitions file is not readable: " . TAG_DEFINITIONS . "\n";
        return null;
    }

    $json = file_get_contents(TAG_DEFINITIONS);
    if ($json === false) {
        echo "❌ Could not read tag definitions file: " . TAG_DEFINITIONS . "\n";
        return null;
    }

    $definitions = json_decode($json, true);
    if (!is_array($definitions)) {
        echo "❌ Tag definitions file is not valid JSON: " . json_last_error_msg() . "\n";
        return null;
    }

    // Reject structurally wrong files here instead of failing later inside
    // count() or foreach with a less helpful error.
    foreach (['main_tags', 'subtags'] as $requiredKey) {
        if (!isset($definitions[$requiredKey]) || !is_array($definitions[$requiredKey])) {
            echo "❌ Tag definitions file is missing the '{$requiredKey}' section\n";
            return null;
        }
    }

    return $definitions;
}
/**
 * Load articles from WordPress database.
 *
 * Pages through every published 'post' (100 per query) and builds one record
 * per post containing its URL, title, optional alternate H1, first category,
 * Yoast meta description and focus keyword, cleaned and word-limited body
 * text, plus issuer, card-product and offer-feature mentions detected in the
 * raw content, and existing tag names.
 *
 * Paging stops at the first empty page. The previous version derived the page
 * count from posts of every status, so it issued extra empty queries whenever
 * drafts, revisions or trashed posts existed.
 *
 * @return array list of associative article records (empty when nothing is published)
 */
function loadArticlesFromDatabase() {
    echo "🔍 Querying WordPress database...\n";
    $articles = [];
    $posts_per_page = 100;

    // Card issuers to match (case-insensitive substring match on the raw content)
    $card_issuers = [
        'Chase', 'American Express', 'Amex', 'Citi', 'Citibank',
        'Bank of America', 'Capital One', 'Wells Fargo', 'Discover',
        'Barclays', 'U.S. Bank', 'PNC', 'TD Bank', 'USAA',
        'Navy Federal', 'Synchrony', 'Apple', 'Goldman Sachs'
    ];

    // Offer features patterns: feature key => phrases, any of which marks the feature as present
    $offer_features_patterns = [
        'no_annual_fee' => ['no annual fee', '$0 annual fee', 'no yearly fee', 'zero annual fee'],
        '0_intro_apr_purchases' => ['0% intro apr on purchases', '0% apr on purchases', 'intro apr purchases'],
        '0_intro_apr_bt' => ['0% intro apr on balance transfers', '0% balance transfer', 'intro apr balance transfer'],
        'welcome_bonus' => ['welcome bonus', 'sign-up bonus', 'signup bonus', 'intro bonus'],
        'cash_back' => ['cash back', 'cashback'],
        'points' => ['points', 'reward points'],
        'miles' => ['miles', 'airline miles', 'travel miles'],
        'secured' => ['secured card', 'secured credit card'],
        'student' => ['student card', 'student credit card'],
        'business' => ['business card', 'business credit card'],
        'prequal' => ['prequalify', 'pre-qualify', 'prequalification', 'check eligibility'],
        'instant_number' => ['instant card number', 'instant approval', 'use instantly']
    ];

    $post_types = ['post'];

    // Query posts in batches until a page comes back empty
    for ($paged = 1; ; $paged++) {
        $query = new WP_Query([
            'posts_per_page' => $posts_per_page,
            'paged' => $paged,
            // ID is a tie-breaker: ordering by date alone is not a total order,
            // so posts sharing a timestamp could be skipped or repeated across pages.
            'orderby' => ['date' => 'ASC', 'ID' => 'ASC'],
            'post_type' => $post_types,
            'post_status' => 'publish',
            'update_post_term_cache' => true,
        ]);

        if (!$query->have_posts()) {
            wp_reset_postdata();
            break;
        }

        while ($query->have_posts()) {
            $query->the_post();
            $post_id = get_the_ID();

            // URL
            $url = get_permalink();

            // Title
            $title = do_shortcode(get_the_title());

            // H1 (if different) - only available when the theme/plugin helper exists
            $h1 = '';
            if (function_exists('get_full_review_content_data')) {
                $full_review_content_data = get_full_review_content_data($post_id);
                if (
                    is_array($full_review_content_data)
                    && !empty($full_review_content_data['enabled'])
                    && !empty($full_review_content_data['id'])
                ) {
                    $h1_title = do_shortcode(get_the_title($full_review_content_data['id']));
                    if ($h1_title !== $title) {
                        $h1 = $h1_title;
                    }
                }
            }

            // Section / Category (first category returned for the post)
            $section_category = '';
            $categories = get_the_category($post_id);
            if ($categories && !is_wp_error($categories)) {
                $section_category = $categories[0]->name;
            }

            // Meta Description
            $meta_description = get_post_meta($post_id, '_yoast_wpseo_metadesc', true);

            // Get raw content for parsing
            $raw_content = get_the_content(null, false, $post_id);

            // Full Article Content (cleaned for LLM, with word limit)
            $clean_content = strip_shortcodes($raw_content);
            $clean_content = wp_strip_all_tags($clean_content);
            $clean_content = trim(preg_replace('/\s+/', ' ', $clean_content));

            // Limit to MAX_ARTICLE_WORDS to prevent token limit issues
            $words = explode(' ', $clean_content);
            if (count($words) > MAX_ARTICLE_WORDS) {
                $words = array_slice($words, 0, MAX_ARTICLE_WORDS);
                $full_article_content = implode(' ', $words) . ' [Content truncated - article exceeds ' . MAX_ARTICLE_WORDS . ' words]';
            } else {
                $full_article_content = $clean_content;
            }

            // Card Issuer Mentions ('Amex' is reported as 'American Express')
            $issuer_mentions = [];
            foreach ($card_issuers as $issuer) {
                if (stripos($raw_content, $issuer) === false) {
                    continue;
                }
                $normalized_issuer = ($issuer === 'Amex') ? 'American Express' : $issuer;
                if (!in_array($normalized_issuer, $issuer_mentions, true)) {
                    $issuer_mentions[] = $normalized_issuer;
                }
            }
            $card_issuer_mentions = implode(', ', $issuer_mentions);

            // Card Product Mentions (post IDs referenced by card shortcodes, resolved to titles)
            $card_mentions = [];
            if (preg_match_all('/\[(?:offer-jump|jump_link|card_field)[^\]]*id=["\']?(\d+)["\']?[^\]]*\]/i', $raw_content, $matches)) {
                foreach ($matches[1] as $card_id) {
                    $card_title = get_the_title($card_id);
                    if ($card_title && !in_array($card_title, $card_mentions, true)) {
                        $card_mentions[] = $card_title;
                    }
                }
            }
            $card_product_mentions = implode(', ', $card_mentions);

            // Offer Features (each feature is recorded once, on its first matching phrase)
            $detected_features = [];
            $content_lower = strtolower($raw_content);
            foreach ($offer_features_patterns as $feature_key => $patterns) {
                foreach ($patterns as $pattern) {
                    if (strpos($content_lower, strtolower($pattern)) !== false) {
                        $detected_features[] = $feature_key;
                        break;
                    }
                }
            }
            $offer_features = implode(', ', $detected_features);

            // Existing Tags
            $existing_tags = '';
            $tags = get_the_tags($post_id);
            if ($tags && !is_wp_error($tags)) {
                $tag_names = [];
                foreach ($tags as $tag) {
                    $tag_names[] = $tag->name;
                }
                $existing_tags = implode(', ', $tag_names);
            }

            // Top Keywords
            $top_keywords = get_post_meta($post_id, '_yoast_wpseo_focuskw', true);

            $articles[] = [
                'post_id' => $post_id,
                'url' => $url,
                'title' => $title,
                'h1_if_different' => $h1,
                'section_category' => $section_category,
                'meta_description' => $meta_description,
                'full_article_content' => $full_article_content,
                'card_issuer_mentions' => $card_issuer_mentions,
                'card_product_mentions' => $card_product_mentions,
                'offer_features' => $offer_features,
                'existing_tags' => $existing_tags,
                'top_keywords' => $top_keywords,
            ];
        }

        // Release per-page state so memory does not grow with the number of pages
        wp_reset_postdata();
        wp_reset_query();
        wp_cache_flush();
    }

    return $articles;
}
/**
 * Tag one batch of articles with the OpenAI chat completions API.
 *
 * Sends a single request containing the prompt for the whole batch and a
 * strict JSON schema for the reply. On transport failure every article in the
 * batch is marked MANUAL_REVIEW. On success the reported prompt/completion
 * token counts are added to $tokenUsage.
 *
 * @param array $batch          article records for this batch
 * @param array $tagDefinitions decoded tag definitions
 * @param array $tokenUsage     running token totals, updated in place
 * @return array one ['tag', 'subtag', 'error'] entry per article, in batch order
 */
function processWithChatGPT($batch, $tagDefinitions, &$tokenUsage) {
    $userPrompt = buildPrompt($batch, $tagDefinitions);
    $articleCount = count($batch);
    $replySchema = buildResponseSchema($articleCount);

    $systemMessage = [
        'role' => 'system',
        'content' => 'You are an expert content taxonomist specializing in credit card and personal finance content. Your task is to accurately categorize articles based on their content and assign appropriate tags.'
    ];
    $userMessage = [
        'role' => 'user',
        'content' => $userPrompt
    ];
    $responseFormat = [
        'type' => 'json_schema',
        'json_schema' => [
            'name' => 'article_tagging_response',
            'strict' => true,
            'schema' => $replySchema
        ]
    ];

    $payload = [
        'model' => OPENAI_MODEL,
        'messages' => [$systemMessage, $userMessage],
        'response_format' => $responseFormat,
        'temperature' => 0.3,
        'max_completion_tokens' => 1000 // Only tags come back, so a small cap is enough
    ];

    // The first article's URL is passed along purely for error logging.
    $apiResponse = makeAPICall(
        OPENAI_ENDPOINT,
        OPENAI_API_KEY,
        $payload,
        'ChatGPT',
        $batch[0]['url'] ?? 'unknown'
    );

    // Transport failure: flag every article in the batch for manual review.
    if ($apiResponse === false) {
        $failedRow = [
            'tag' => 'MANUAL_REVIEW',
            'subtag' => '',
            'error' => true
        ];
        return array_fill(0, $articleCount, $failedRow);
    }

    // Accumulate token usage for the cost estimate.
    if (isset($apiResponse['usage'])) {
        $usage = $apiResponse['usage'];
        $tokenUsage['chatgpt_input'] += $usage['prompt_tokens'] ?? 0;
        $tokenUsage['chatgpt_output'] += $usage['completion_tokens'] ?? 0;
    }

    return parseAPIResponse($apiResponse, $articleCount);
}
/**
 * Tag one batch of articles with the Perplexity chat completions API.
 *
 * Mirrors processWithChatGPT(): one request per batch with a JSON schema for
 * the reply, MANUAL_REVIEW rows for the whole batch on transport failure, and
 * token counts added to $tokenUsage on success.
 *
 * @param array $batch          article records for this batch
 * @param array $tagDefinitions decoded tag definitions
 * @param array $tokenUsage     running token totals, updated in place
 * @return array one ['tag', 'subtag', 'error'] entry per article, in batch order
 */
function processWithPerplexity($batch, $tagDefinitions, &$tokenUsage) {
    $userPrompt = buildPrompt($batch, $tagDefinitions);
    $articleCount = count($batch);
    $replySchema = buildResponseSchema($articleCount);

    $systemMessage = [
        'role' => 'system',
        'content' => 'You are an expert content taxonomist specializing in credit card and personal finance content. Your task is to accurately categorize articles based on their content and assign appropriate tags.'
    ];
    $userMessage = [
        'role' => 'user',
        'content' => $userPrompt
    ];
    $responseFormat = [
        'type' => 'json_schema',
        'json_schema' => [
            'strict' => true,
            'schema' => $replySchema
        ]
    ];

    $payload = [
        'model' => PERPLEXITY_MODEL,
        'messages' => [$systemMessage, $userMessage],
        'response_format' => $responseFormat,
        'temperature' => 0.3,
        'max_tokens' => 1000 // Only tags come back, so a small cap is enough
    ];

    // The first article's URL is passed along purely for error logging.
    $apiResponse = makeAPICall(
        PERPLEXITY_ENDPOINT,
        PERPLEXITY_API_KEY,
        $payload,
        'Perplexity',
        $batch[0]['url'] ?? 'unknown'
    );

    // Transport failure: flag every article in the batch for manual review.
    if ($apiResponse === false) {
        $failedRow = [
            'tag' => 'MANUAL_REVIEW',
            'subtag' => '',
            'error' => true
        ];
        return array_fill(0, $articleCount, $failedRow);
    }

    // Accumulate token usage for the cost estimate.
    if (isset($apiResponse['usage'])) {
        $usage = $apiResponse['usage'];
        $tokenUsage['perplexity_input'] += $usage['prompt_tokens'] ?? 0;
        $tokenUsage['perplexity_output'] += $usage['completion_tokens'] ?? 0;
    }

    return parseAPIResponse($apiResponse, $articleCount);
}
/**
 * Assemble the user prompt sent to the LLM for one batch.
 *
 * The prompt is laid out as: intro, main tag definitions, subtag definitions
 * grouped by main tag, the tagging rules, one section per article (title,
 * full content when present, URL and optional meta description / section),
 * and a closing reminder about title weighting.
 *
 * @param array $batch          article records to include
 * @param array $tagDefinitions decoded definitions with 'main_tags' and 'subtags'
 * @return string the complete prompt text
 */
function buildPrompt($batch, $tagDefinitions) {
    $parts = [];
    $parts[] = "Please analyze the following articles and assign appropriate tags based on the tag definitions provided.\n\n";

    // Main tag definitions
    $parts[] = "=== MAIN TAG DEFINITIONS ===\n\n";
    foreach ($tagDefinitions['main_tags'] as $tagName => $tagInfo) {
        $parts[] = "**" . $tagName . "**\n";
        $parts[] = $tagInfo['definition'] . "\n\n";
    }

    // Subtag definitions, grouped under their main tag
    $parts[] = "\n=== SUBTAG DEFINITIONS ===\n\n";
    foreach ($tagDefinitions['subtags'] as $mainTag => $subtags) {
        $parts[] = "**" . $mainTag . "** subtags:\n";
        foreach ($subtags as $subtagName => $subtagInfo) {
            $parts[] = " - " . $subtagName . ": " . $subtagInfo['definition'] . "\n";
        }
        $parts[] = "\n";
    }

    // Fixed tagging rules
    $parts[] = "\n=== TAGGING RULES ===\n\n";
    $rules = [
        "1. Each article MUST be assigned exactly ONE main tag\n",
        "2. Each article MAY be assigned ONE subtag (or none) based on relevance and the article's main tag. For example, an article about a credit card that offers everyday purchases cash back may get the subtag 'Everyday Purchases' if the main tag is 'Cash Back Credit Cards'.\n",
        "3. **CRITICAL CONSTRAINT - SUBTAG VALIDATION**: \n",
        " - Subtags can ONLY be assigned if they belong to the article's assigned main tag\n",
        " - NEVER assign a subtag from a different main tag category\n",
        " - Before assigning a subtag, verify: Does this subtag appear in the subtag list for my chosen main tag?\n",
        " - If the answer is NO, do not assign that subtag. Leave it empty instead.\n",
        " - Example: 'Everyday Purchases' is ONLY valid for 'Cash Back Credit Cards', not for any other main tag\n",
        "4. **CRITICAL WEIGHTING**: The article TITLE should carry significantly more weight than the article content when determining tag relevance\n",
        " - Title = 60% importance (1.5x the weight of content)\n",
        " - Full Article Content = 40% importance\n",
        "5. First, determine what the title indicates the article is about\n",
        "6. Then, read the full article content to confirm and refine your understanding\n",
        "7. If the title and content suggest different tags, prioritize the title\n",
        "8. Use meta description and section category as minor supporting signals only\n\n",
    ];
    $parts[] = implode('', $rules);
    $parts[] = "**REMINDER**: Before finalizing your response, verify that any assigned subtag belongs to the article's assigned main tag. Cross-check the subtag definitions above.\n\n";

    // One section per article, numbered from 1 to match the response schema keys
    $parts[] = "=== ARTICLES TO TAG ===\n\n";
    foreach ($batch as $index => $article) {
        $articleNum = $index + 1;
        $parts[] = "Article #" . $articleNum . ":\n";
        $parts[] = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n";
        $parts[] = "**TITLE (60% WEIGHT)**: " . $article['title'] . "\n";
        $parts[] = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n";

        // The content heading is always emitted; the body only when there is one
        $parts[] = "**FULL ARTICLE CONTENT (40% WEIGHT)**:\n";
        if (!empty($article['full_article_content'])) {
            $parts[] = $article['full_article_content'] . "\n\n";
        }

        $parts[] = "**SUPPORTING METADATA**:\n";
        $parts[] = "URL: " . $article['url'] . "\n";
        if (!empty($article['meta_description'])) {
            $parts[] = "Meta Description: " . $article['meta_description'] . "\n";
        }
        if (!empty($article['section_category'])) {
            $parts[] = "Section: " . $article['section_category'] . "\n";
        }
        $parts[] = "\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n";
    }

    $parts[] = "REMINDER: The article TITLE should be your PRIMARY signal (60% weight). Use the full article content to confirm and understand context (40% weight), but if there's any conflict, the title takes precedence.";

    return implode('', $parts);
}
/**
 * Build the JSON schema the LLM reply must follow.
 *
 * The schema is an object with one required property per article, named
 * article_1 .. article_N, each holding a required 'main_tag' and 'subtag'
 * string and no additional properties.
 *
 * @param int $articleCount number of articles in the batch
 * @return array schema as a nested PHP array
 */
function buildResponseSchema($articleCount) {
    // Every article slot shares the same shape, so describe it once.
    $perArticle = [
        'type' => 'object',
        'properties' => [
            'main_tag' => [
                'type' => 'string',
                'description' => 'The main tag assigned to this article'
            ],
            'subtag' => [
                'type' => 'string',
                'description' => 'The subtag assigned to this article (empty string if none)'
            ]
        ],
        'required' => ['main_tag', 'subtag'],
        'additionalProperties' => false
    ];

    $slots = [];
    for ($position = 1; $position <= $articleCount; $position++) {
        $slots['article_' . $position] = $perArticle;
    }

    $schema = ['type' => 'object'];
    $schema['properties'] = $slots;
    $schema['required'] = array_keys($slots);
    $schema['additionalProperties'] = false;

    return $schema;
}
/**
 * POST a JSON request to an LLM endpoint, retrying transient failures.
 *
 * The request body is encoded once up front; if it cannot be encoded the call
 * fails immediately instead of sending an empty body on every attempt.
 * Transport errors, HTTP 408, HTTP 429 and 5xx responses are retried with
 * exponential backoff up to MAX_RETRIES attempts. Other 4xx responses
 * (bad request, bad API key, payload too large, ...) are deterministic and
 * are not retried.
 *
 * @param string $endpoint     full URL of the chat completions endpoint
 * @param string $apiKey       bearer token for the Authorization header
 * @param array  $requestData  request body, JSON-encoded before sending
 * @param string $providerName label used in log output
 * @param string $articleUrl   article URL used in log output
 * @return array|false decoded JSON response on HTTP 200, false on failure
 */
function makeAPICall($endpoint, $apiKey, $requestData, $providerName, $articleUrl = 'unknown') {
    // Article text from WordPress may contain invalid UTF-8, which makes a plain
    // json_encode() return false. Substitute bad bytes where the flag is available.
    $encodeFlags = defined('JSON_INVALID_UTF8_SUBSTITUTE') ? JSON_INVALID_UTF8_SUBSTITUTE : 0;
    $payload = json_encode($requestData, $encodeFlags);
    if ($payload === false) {
        echo " ❌ {$providerName} request could not be encoded as JSON for {$articleUrl}: " . json_last_error_msg() . "\n";
        return false;
    }

    $headers = [
        'Authorization: Bearer ' . $apiKey,
        'Content-Type: application/json'
    ];

    $attempt = 0;
    while ($attempt < MAX_RETRIES) {
        $attempt++;

        $ch = curl_init($endpoint);
        if ($ch === false) {
            echo " ❌ {$providerName} could not initialise cURL for {$articleUrl}\n";
            return false;
        }
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 300); // Long articles can take a while to process

        $response = curl_exec($ch);
        $httpCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $error = curl_error($ch);
        curl_close($ch);

        if ($error) {
            echo " ⚠️ {$providerName} API error (attempt {$attempt}) for {$articleUrl}: {$error}\n";
        } elseif ($httpCode === 200) {
            $decoded = json_decode($response, true);
            if (is_array($decoded)) {
                return $decoded;
            }
            // A 200 with an undecodable body is treated like any other failed attempt.
            echo " ⚠️ {$providerName} returned invalid JSON (attempt {$attempt}) for {$articleUrl}\n";
        } elseif ($httpCode === 429) {
            // Rate limit - wait longer, but do not sleep after the final attempt
            if ($attempt < MAX_RETRIES) {
                $waitTime = pow(2, $attempt) * 5;
                echo " ⚠️ {$providerName} rate limit hit (attempt {$attempt}) for {$articleUrl}. Waiting {$waitTime}s...\n";
                sleep($waitTime);
            } else {
                echo " ⚠️ {$providerName} rate limit hit (attempt {$attempt}) for {$articleUrl}\n";
            }
            continue;
        } elseif ($httpCode === 413) {
            // Payload too large
            echo " ⚠️ {$providerName} payload too large (attempt {$attempt}) for {$articleUrl} - article content may be too long\n";
            echo " Try reducing MAX_ARTICLE_WORDS in the script\n";
            return false; // Don't retry payload too large errors
        } else {
            echo " ⚠️ {$providerName} API returned status {$httpCode} (attempt {$attempt}) for {$articleUrl}\n";
            $errorData = json_decode($response, true);
            if ($errorData && isset($errorData['error']['message'])) {
                echo " Error: {$errorData['error']['message']}\n";
            } elseif ($response) {
                echo " Response: " . substr($response, 0, 500) . "\n";
            }
            // Client errors other than request-timeout will fail identically on
            // every attempt, so give up straight away.
            if ($httpCode >= 400 && $httpCode < 500 && $httpCode !== 408) {
                return false;
            }
        }

        // Exponential backoff for retries
        if ($attempt < MAX_RETRIES) {
            $waitTime = pow(2, $attempt);
            echo " ⏳ Retrying in {$waitTime}s...\n";
            sleep($waitTime);
        }
    }

    echo " ❌ {$providerName} API failed after " . MAX_RETRIES . " attempts for {$articleUrl}\n";
    return false;
}
/**
 * Parse API response into one result row per article.
 *
 * The assistant message content is expected to be a JSON object keyed
 * article_1 .. article_N, each with 'main_tag' and 'subtag' strings. Any
 * article whose entry is missing, malformed, or has an empty or non-string
 * main tag is returned as MANUAL_REVIEW with error => true, so the error flag
 * always agrees with the tag. Tag and subtag values are trimmed.
 *
 * @param mixed $apiResponse   decoded API response (array), or null/false when unavailable
 * @param int   $expectedCount number of articles that were sent
 * @return array exactly $expectedCount rows of ['tag', 'subtag', 'error']
 */
function parseAPIResponse($apiResponse, $expectedCount) {
    $manualReview = [
        'tag' => 'MANUAL_REVIEW',
        'subtag' => '',
        'error' => true
    ];

    // No message content at all (e.g. refusal or malformed response)
    $rawContent = $apiResponse['choices'][0]['message']['content'] ?? null;
    if (!is_string($rawContent)) {
        return array_fill(0, $expectedCount, $manualReview);
    }

    // Content that is not a non-empty JSON object (e.g. truncated output)
    $content = json_decode($rawContent, true);
    if (!is_array($content) || empty($content)) {
        return array_fill(0, $expectedCount, $manualReview);
    }

    $results = [];
    for ($i = 1; $i <= $expectedCount; $i++) {
        $entry = $content["article_{$i}"] ?? null;

        $tag = '';
        if (is_array($entry) && isset($entry['main_tag']) && is_string($entry['main_tag'])) {
            $tag = trim($entry['main_tag']);
        }

        // Without a usable main tag the row cannot be trusted, subtag included.
        if ($tag === '') {
            $results[] = $manualReview;
            continue;
        }

        $subtag = '';
        if (isset($entry['subtag']) && is_string($entry['subtag'])) {
            $subtag = trim($entry['subtag']);
        }

        $results[] = [
            'tag' => $tag,
            'subtag' => $subtag,
            'error' => false
        ];
    }

    return $results;
}
/**
 * Merge per-provider tagging results into CSV-ready rows.
 *
 * Every row starts with post_id, url and title. Columns for a provider are
 * added only when that provider is enabled; when its results are missing the
 * tag is reported as 'N/A'. A 'tags_match' yes/no column is added only when
 * both providers are enabled, and is 'yes' only if tag and subtag both agree.
 *
 * @param array      $batch             article records for this batch
 * @param array|null $chatgptResults    rows from processWithChatGPT(), or null
 * @param array|null $perplexityResults rows from processWithPerplexity(), or null
 * @return array list of associative rows in batch order
 */
function combineResults($batch, $chatgptResults, $perplexityResults) {
    // Look up one provider's result for an article, falling back to N/A.
    $resultFor = function ($providerResults, $position) {
        if ($providerResults) {
            return $providerResults[$position];
        }
        return ['tag' => 'N/A', 'subtag' => ''];
    };

    $rows = [];
    foreach ($batch as $position => $article) {
        // Base columns
        $row = [
            'post_id' => $article['post_id'],
            'url' => $article['url'],
            'title' => $article['title']
        ];

        if (USE_CHATGPT) {
            $fromChatgpt = $resultFor($chatgptResults, $position);
            $row['chatgpt_tag'] = $fromChatgpt['tag'];
            $row['chatgpt_subtag'] = $fromChatgpt['subtag'];
        }

        if (USE_PERPLEXITY) {
            $fromPerplexity = $resultFor($perplexityResults, $position);
            $row['perplexity_tag'] = $fromPerplexity['tag'];
            $row['perplexity_subtag'] = $fromPerplexity['subtag'];
        }

        // Agreement column only makes sense with both providers running
        if (USE_CHATGPT && USE_PERPLEXITY) {
            $left = $resultFor($chatgptResults, $position);
            $right = $resultFor($perplexityResults, $position);
            $sameTag = $left['tag'] === $right['tag'];
            $sameSubtag = $left['subtag'] === $right['subtag'];
            if ($sameTag && $sameSubtag) {
                $row['tags_match'] = 'yes';
            } else {
                $row['tags_match'] = 'no';
            }
        }

        $rows[] = $row;
    }

    return $rows;
}
/**
 * Append a batch of result rows to the output CSV.
 *
 * The file is opened in append mode on every call. When it did not exist
 * beforehand, a header row matching the enabled providers is written first.
 * If the file cannot be opened, a message is printed and nothing is written.
 *
 * @param array $results rows produced by combineResults()
 * @return void
 */
function appendToOutputCSV($results) {
    // Must be checked before fopen(), which creates the file.
    $needsHeader = !file_exists(OUTPUT_CSV);

    $handle = fopen(OUTPUT_CSV, 'a');
    if (!$handle) {
        echo " ❌ Failed to open output file\n";
        return;
    }

    if ($needsHeader) {
        // Column order mirrors the key order of the rows built by combineResults()
        $columns = array_merge(
            ['post_id', 'url', 'title'],
            USE_CHATGPT ? ['chatgpt_tag', 'chatgpt_subtag'] : [],
            USE_PERPLEXITY ? ['perplexity_tag', 'perplexity_subtag'] : [],
            (USE_CHATGPT && USE_PERPLEXITY) ? ['tags_match'] : []
        );
        fputcsv($handle, $columns);
    }

    foreach ($results as $row) {
        fputcsv($handle, $row);
    }

    fclose($handle);
}
/**
 * Summarise the output CSV into per-provider tag count files.
 *
 * Reads OUTPUT_CSV, locates the tag/subtag columns from its header, and for
 * each enabled provider counts articles per main tag and per
 * (main tag, subtag) pair. Rows tagged MANUAL_REVIEW, N/A or empty are
 * ignored. Counts are sorted by tag then subtag and written to
 * output/chatgpt-tag-counts.csv and output/perplexity-tag-counts.csv.
 *
 * @return void
 */
function generateTagCounts() {
    if (!file_exists(OUTPUT_CSV)) {
        echo " ⚠️ No output CSV found to generate counts\n";
        return;
    }

    $handle = fopen(OUTPUT_CSV, 'r');
    if (!$handle) {
        echo " ❌ Failed to open output CSV\n";
        return;
    }

    // The header decides where each provider's columns live
    $header = fgetcsv($handle);
    if (!$header) {
        echo " ❌ Failed to read CSV header\n";
        fclose($handle);
        return;
    }
    $columnIndex = array_flip($header);
    $chatgptTagIdx = $columnIndex['chatgpt_tag'] ?? null;
    $chatgptSubtagIdx = $columnIndex['chatgpt_subtag'] ?? null;
    $perplexityTagIdx = $columnIndex['perplexity_tag'] ?? null;
    $perplexitySubtagIdx = $columnIndex['perplexity_subtag'] ?? null;

    // Adds one CSV row to a provider's counters: always the main-tag bucket
    // (keyed "tag||"), plus the "tag||subtag" bucket when a subtag is present.
    $tally = function (array &$counts, $row, $tagIdx, $subtagIdx) {
        $tag = $row[$tagIdx] ?? '';
        $subtag = $subtagIdx !== null ? ($row[$subtagIdx] ?? '') : '';
        if (empty($tag) || $tag === 'MANUAL_REVIEW' || $tag === 'N/A') {
            return;
        }

        $buckets = [$tag . '||' => ''];
        if (!empty($subtag)) {
            $buckets[$tag . '||' . $subtag] = $subtag;
        }
        foreach ($buckets as $bucketKey => $bucketSubtag) {
            if (!isset($counts[$bucketKey])) {
                $counts[$bucketKey] = ['tag' => $tag, 'subtag' => $bucketSubtag, 'count' => 0];
            }
            $counts[$bucketKey]['count']++;
        }
    };

    $chatgptCounts = [];
    $perplexityCounts = [];
    while (($row = fgetcsv($handle)) !== false) {
        if ($chatgptTagIdx !== null && USE_CHATGPT) {
            $tally($chatgptCounts, $row, $chatgptTagIdx, $chatgptSubtagIdx);
        }
        if ($perplexityTagIdx !== null && USE_PERPLEXITY) {
            $tally($perplexityCounts, $row, $perplexityTagIdx, $perplexitySubtagIdx);
        }
    }
    fclose($handle);

    // Order by tag name, then subtag (main-tag rows have an empty subtag and sort first)
    $byTagThenSubtag = function ($a, $b) {
        return $a['tag'] !== $b['tag']
            ? strcmp($a['tag'], $b['tag'])
            : strcmp($a['subtag'], $b['subtag']);
    };
    usort($chatgptCounts, $byTagThenSubtag);
    usort($perplexityCounts, $byTagThenSubtag);

    // Writes one provider's summary file, or explains why nothing was written
    $writeSummary = function ($enabled, array $counts, $path, $label) {
        if ($enabled && !empty($counts)) {
            $out = fopen($path, 'w');
            if ($out) {
                fputcsv($out, ['Main Tag', 'Subtag', 'Article Count']);
                foreach ($counts as $entry) {
                    fputcsv($out, [$entry['tag'], $entry['subtag'], $entry['count']]);
                }
                fclose($out);
                echo " ✅ {$label} counts saved\n";
            }
        } elseif ($enabled) {
            echo " ⚠️ No {$label} tags to count\n";
        } else {
            echo " ⏭️ {$label} disabled - no counts to generate\n";
        }
    };

    $writeSummary(USE_CHATGPT, $chatgptCounts, __DIR__ . '/output/chatgpt-tag-counts.csv', 'ChatGPT');
    $writeSummary(USE_PERPLEXITY, $perplexityCounts, __DIR__ . '/output/perplexity-tag-counts.csv', 'Perplexity');
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment