Skip to content

Instantly share code, notes, and snippets.

@db-pj
Created December 2, 2025 15:23
Show Gist options
  • Select an option

  • Save db-pj/da501c255b2c94420569c963ae96aefa to your computer and use it in GitHub Desktop.

Select an option

Save db-pj/da501c255b2c94420569c963ae96aefa to your computer and use it in GitHub Desktop.
<?php
/**
* Tag Generation Script for BadCredit.org
*
* Analyzes all published posts using OpenAI API to generate a comprehensive
* tag taxonomy with main tags and subtags for SEO and content organization.
*
* Output: JSON file in volumes/phpfpm/scripts/tags/output/
*/
// WordPress Bootstrap
// PUBLIC_HTML must name the directory that contains wp-load.php.
$public_html = getenv('PUBLIC_HTML');
if (!$public_html || !file_exists($public_html . '/wp-load.php')) {
    echo "❌ Error: PUBLIC_HTML environment variable not set or wp-load.php not found\n";
    exit(1);
}
require_once($public_html . '/wp-load.php');

// Configuration
// The API key is taken from the OPENAI_API_KEY environment variable so the secret does
// not have to live in this file; the '' fallback can still be edited for a one-off run.
// If the constant is already defined by code loaded above, that value is kept.
if (!defined('OPENAI_API_KEY')) {
    define('OPENAI_API_KEY', getenv('OPENAI_API_KEY') ?: '');
}
const OPENAI_MODEL = 'gpt-4o';
const BATCH_SIZE = 10; // Process posts in batches
const OUTPUT_DIR = __DIR__ . '/output/';

// Fail fast: without a key every API call below would be rejected, but only after all
// posts have been loaded and each batch has waited out its rate-limit pause.
if (!is_string(OPENAI_API_KEY) || trim(OPENAI_API_KEY) === '') {
    echo "❌ Error: OPENAI_API_KEY is not configured\n";
    exit(1);
}

// Ensure output directory exists (the second is_dir() covers a concurrent creation).
if (!is_dir(OUTPUT_DIR) && !mkdir(OUTPUT_DIR, 0755, true) && !is_dir(OUTPUT_DIR)) {
    echo "❌ Error: could not create output directory " . OUTPUT_DIR . "\n";
    exit(1);
}
echo "🏁 Starting Tag Generation Process\n";
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n";
/**
 * Fetch every published post of type "post", newest first.
 *
 * No pagination is applied (posts_per_page = -1), so the whole result set is
 * returned in one array.
 *
 * @return array The `posts` property of the executed WP_Query.
 */
function get_all_posts() {
    $published_posts_query = new WP_Query([
        'post_type'      => 'post',
        'post_status'    => 'publish',
        'posts_per_page' => -1,
        'orderby'        => 'date',
        'order'          => 'DESC',
    ]);

    return $published_posts_query->posts;
}
/**
 * Extract clean, plain-text content from a post for use in an API prompt.
 *
 * Shortcodes and HTML are stripped, runs of whitespace are collapsed to a single
 * space, and the text is cut to at most 2000 characters.
 *
 * @param object $post Object exposing a `post_content` string property.
 * @return string Trimmed plain text, at most 2000 characters long.
 */
function extract_clean_content($post) {
    $content = $post->post_content;
    // Remove shortcodes
    $content = strip_shortcodes($content);
    // Remove HTML tags
    $content = wp_strip_all_tags($content);
    // Remove extra whitespace
    $content = preg_replace('/\s+/', ' ', $content);
    // Limit to first 2000 characters for API efficiency.
    // The cut must be character-aware: a byte-wise substr() can split a multibyte
    // UTF-8 sequence, and the resulting invalid string makes json_encode() fail
    // for the whole batch this post belongs to.
    if (function_exists('mb_substr')) {
        $content = mb_substr($content, 0, 2000, 'UTF-8');
    } elseif (preg_match('/^.{0,2000}/us', $content, $m) === 1) {
        // No mbstring: the /u modifier makes "." match whole UTF-8 characters.
        $content = $m[0];
    } else {
        // Content is not valid UTF-8 to begin with; fall back to a byte cut.
        $content = substr($content, 0, 2000);
    }
    return trim($content);
}
/**
 * Build the per-post record that is sent to the tag-suggestion prompt.
 *
 * @param object $post Post object (reads ID, post_title, post_date and, through
 *                     extract_clean_content(), post_content).
 * @return array Record with keys id, title, excerpt, content, categories,
 *               existing_tags and date, in that order.
 */
function get_post_metadata($post) {
    // Ask the term lookups for plain names rather than term objects.
    $term_args = ['fields' => 'names'];
    $category_names = wp_get_post_categories($post->ID, $term_args);
    $tag_names = wp_get_post_tags($post->ID, $term_args);

    $metadata = [
        'id'    => $post->ID,
        'title' => $post->post_title,
    ];
    $metadata['excerpt'] = get_the_excerpt($post);
    $metadata['content'] = extract_clean_content($post);
    $metadata['categories'] = $category_names;
    $metadata['existing_tags'] = $tag_names;
    $metadata['date'] = $post->post_date;

    return $metadata;
}
/**
 * Call the OpenAI chat completions API to suggest tags for a batch of posts.
 *
 * @param array $posts_data List of per-post records (as built by get_post_metadata()).
 * @return array|null Decoded JSON object from the model (expected to contain a
 *                    "posts" list), or null when the request fails, the API
 *                    answers with a non-200 status, or the reply is not a JSON
 *                    object/array.
 */
function analyze_posts_batch($posts_data) {
    // Encode up front: json_encode() returns false on e.g. invalid UTF-8, and
    // concatenating that into the prompt would silently send an empty article list.
    $posts_json = json_encode($posts_data, JSON_PRETTY_PRINT);
    if ($posts_json === false) {
        echo "⚠️ Could not encode batch as JSON: " . json_last_error_msg() . "\n";
        return null;
    }
    $prompt = "You are analyzing content from BadCredit.org, a financial advice website focused on credit cards, loans, credit repair, and financial products for people with bad credit.
Analyze the following " . count($posts_data) . " articles and suggest relevant tags that would help with SEO and content organization.
For each article, suggest 3-5 tags that capture:
- The main topic (e.g., Credit Cards, Personal Loans, Credit Score)
- Specific subtopics (e.g., Balance Transfer, APR, Rewards)
- Target audience aspects (e.g., Bad Credit, First-Time, Students)
Articles to analyze:
" . $posts_json . "
Respond with ONLY a JSON object in this exact format:
{
\"posts\": [
{
\"post_id\": 123,
\"suggested_tags\": [\"Tag 1\", \"Tag 2\", \"Tag 3\"]
}
]
}";
    $data = array(
        'model' => OPENAI_MODEL,
        'messages' => array(
            array('role' => 'system', 'content' => 'You are a content taxonomy expert specializing in financial websites. Respond only with valid JSON.'),
            array('role' => 'user', 'content' => $prompt)
        ),
        'temperature' => 0.3,
        'max_tokens' => 2000,
    );
    $payload = json_encode($data);
    if ($payload === false) {
        echo "⚠️ Could not encode API request: " . json_last_error_msg() . "\n";
        return null;
    }

    $ch = curl_init('https://api.openai.com/v1/chat/completions');
    if ($ch === false) {
        echo "⚠️ Could not initialise cURL\n";
        return null;
    }
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
    curl_setopt($ch, CURLOPT_HTTPHEADER, array(
        'Content-Type: application/json',
        'Authorization: Bearer ' . OPENAI_API_KEY
    ));
    // Without timeouts a stalled connection would hang the whole run indefinitely.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 120);
    $response = curl_exec($ch);
    $curl_error = curl_error($ch);
    $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    // Transport-level failure (DNS, TLS, timeout): there is no HTTP body to show.
    if ($response === false) {
        echo "⚠️ API request failed: $curl_error\n";
        return null;
    }
    if ($http_code !== 200) {
        echo "⚠️ API Error (HTTP $http_code): $response\n";
        return null;
    }

    $result = json_decode($response, true);
    if (!isset($result['choices'][0]['message']['content'])
        || !is_string($result['choices'][0]['message']['content'])) {
        return null;
    }
    $content = trim($result['choices'][0]['message']['content']);
    // If the model wrapped its answer in a markdown code fence (with or without a
    // language tag, possibly surrounded by prose), keep only the fenced payload.
    if (preg_match('/```(?:[a-z]+)?\s*(.*?)\s*```/is', $content, $fenced) === 1) {
        $content = $fenced[1];
    }
    $decoded = json_decode($content, true);
    return is_array($decoded) ? $decoded : null;
}
/**
 * Aggregate suggested tags across posts and count how often each occurs.
 *
 * The input comes from model output, so malformed entries are skipped instead
 * of raising errors: suggestions that are not arrays, `suggested_tags` values
 * that are not lists, and tags that are not non-empty strings are ignored.
 *
 * @param array $all_suggestions List of arrays, each expected to carry
 *                               `post_id` and `suggested_tags` (list of strings).
 * @return array `frequencies`: tag => occurrence count, sorted descending;
 *               `contexts`: tag => list of post IDs that suggested it (a
 *               suggestion without `post_id` is counted but adds no ID).
 */
function aggregate_tags($all_suggestions) {
    $tag_frequency = array();
    $tag_contexts = array(); // Store which posts use each tag
    foreach ($all_suggestions as $suggestion) {
        if (!is_array($suggestion)
            || !isset($suggestion['suggested_tags'])
            || !is_array($suggestion['suggested_tags'])) {
            continue;
        }
        $post_id = isset($suggestion['post_id']) ? $suggestion['post_id'] : null;
        foreach ($suggestion['suggested_tags'] as $tag) {
            // trim() on an array/object is a TypeError on PHP 8; skip such values.
            if (!is_string($tag)) {
                continue;
            }
            $tag = trim($tag);
            // Compare against '' explicitly; empty() would also discard the tag "0".
            if ($tag === '') {
                continue;
            }
            if (!isset($tag_frequency[$tag])) {
                $tag_frequency[$tag] = 0;
                $tag_contexts[$tag] = array();
            }
            $tag_frequency[$tag]++;
            if ($post_id !== null) {
                $tag_contexts[$tag][] = $post_id;
            }
        }
    }
    // Sort by frequency
    arsort($tag_frequency);
    return array(
        'frequencies' => $tag_frequency,
        'contexts' => $tag_contexts
    );
}
/**
 * Generate parent tags and subtags using AI.
 *
 * Sends the 100 most frequent tags to the OpenAI chat completions API and asks
 * for a two-level taxonomy with definitions.
 *
 * @param array $aggregated_data Result of aggregate_tags(); only its
 *                               `frequencies` map (tag => count) is read.
 * @return array|null Array with `main_tags` and `subtags` maps, or null when
 *                    there are no tags, the request fails, or the reply does
 *                    not have the expected structure.
 */
function generate_taxonomy($aggregated_data) {
    $frequencies = (isset($aggregated_data['frequencies']) && is_array($aggregated_data['frequencies']))
        ? $aggregated_data['frequencies']
        : array();
    // With no input tags the model could only invent a taxonomy; skip the API call.
    if (count($frequencies) === 0) {
        echo "⚠️ No tags available to build a taxonomy from\n";
        return null;
    }
    // Get top tags
    $top_tags = array_slice($frequencies, 0, 100, true);
    $top_tags_json = json_encode($top_tags, JSON_PRETTY_PRINT);
    if ($top_tags_json === false) {
        echo "⚠️ Could not encode tag frequencies as JSON: " . json_last_error_msg() . "\n";
        return null;
    }
    $prompt = "You are organizing a tag taxonomy for BadCredit.org, a financial advice website focused on helping people with bad credit access financial products.
Based on the following tags and their frequencies, create a hierarchical taxonomy with:
- 20-30 main parent tags (broad categories)
- 1-6 subtags under each parent (specific topics)
Tag frequency data:
" . $top_tags_json . "
Create a taxonomy that:
1. Groups related tags logically
2. Has clear parent-child relationships
3. Covers the main topics on the site
4. Helps with SEO and content organization
5. Uses clear, descriptive, and SPECIFIC definitions
CRITICAL: Tag definitions will be used in the next phase to automatically assign tags to articles. Each definition must:
- Be specific enough to determine if an article matches the tag
- Include key concepts, keywords, and topics that belong to this tag
- Describe the scope clearly (what IS included and what is NOT)
- Be 2-3 sentences with concrete details
- Focus on content characteristics, not just broad categories
Example of GOOD definitions:
\"Credit Card Rewards\": \"Articles discussing credit card rewards programs, including points systems, cash back offers, travel rewards, redemption strategies, reward optimization tips, and comparisons of rewards programs. Covers maximizing rewards, understanding reward value, and pairing multiple cards for benefits.\"
Example of BAD definitions:
\"Credit Cards\": \"Articles about credit cards.\" (Too vague, not actionable)
Respond with ONLY a JSON object in this EXACT format:
{
\"main_tags\": {
\"Parent Tag Name\": {
\"definition\": \"Detailed 2-3 sentence description with specific keywords, topics, and scope that clearly defines what articles should receive this tag. Include concrete examples of article topics.\"
}
},
\"subtags\": {
\"Parent Tag Name\": {
\"Subtag Name\": {
\"definition\": \"Specific 2-3 sentence description with clear criteria for matching articles to this subtag. Include keywords and specific topics covered.\"
}
}
}
}
IMPORTANT:
- Use the exact field names: 'main_tags' and 'subtags'
- Each tag must have a detailed, specific 'definition' field (2-3 sentences minimum)
- Keep parent tag names broad (e.g., 'Credit Cards', 'Personal Loans')
- Make subtags specific (e.g., 'Balance Transfer', 'Secured Cards')
- Definitions must be actionable for automated tag assignment";
    $data = array(
        'model' => OPENAI_MODEL,
        'messages' => array(
            array('role' => 'system', 'content' => 'You are a content taxonomy expert. Respond only with valid JSON in the exact format requested.'),
            array('role' => 'user', 'content' => $prompt)
        ),
        'temperature' => 0.2,
        'max_tokens' => 3000,
    );
    $payload = json_encode($data);
    if ($payload === false) {
        echo "⚠️ Could not encode API request: " . json_last_error_msg() . "\n";
        return null;
    }

    $ch = curl_init('https://api.openai.com/v1/chat/completions');
    if ($ch === false) {
        echo "⚠️ Could not initialise cURL\n";
        return null;
    }
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
    curl_setopt($ch, CURLOPT_HTTPHEADER, array(
        'Content-Type: application/json',
        'Authorization: Bearer ' . OPENAI_API_KEY
    ));
    // Without timeouts a stalled connection would hang the script indefinitely.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 180);
    $response = curl_exec($ch);
    $curl_error = curl_error($ch);
    $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    // Transport-level failure (DNS, TLS, timeout): there is no HTTP body to show.
    if ($response === false) {
        echo "⚠️ API request failed: $curl_error\n";
        return null;
    }
    if ($http_code !== 200) {
        echo "⚠️ API Error (HTTP $http_code): $response\n";
        return null;
    }

    $result = json_decode($response, true);
    if (!isset($result['choices'][0]['message']['content'])
        || !is_string($result['choices'][0]['message']['content'])) {
        return null;
    }
    // A "length" finish reason means the reply was cut off at max_tokens, which
    // normally leaves the JSON incomplete; say so before the structure check fails.
    if (isset($result['choices'][0]['finish_reason'])
        && $result['choices'][0]['finish_reason'] === 'length') {
        echo "⚠️ Taxonomy response was truncated at the max_tokens limit; the JSON may be incomplete\n";
    }
    $content = trim($result['choices'][0]['message']['content']);
    // If the model wrapped its answer in a markdown code fence (with or without a
    // language tag, possibly surrounded by prose), keep only the fenced payload.
    if (preg_match('/```(?:[a-z]+)?\s*(.*?)\s*```/is', $content, $fenced) === 1) {
        $content = $fenced[1];
    }
    $taxonomy = json_decode($content, true);
    // Validate structure: both sections must be present and be maps, because
    // callers count and iterate them.
    if (is_array($taxonomy)
        && isset($taxonomy['main_tags']) && is_array($taxonomy['main_tags'])
        && isset($taxonomy['subtags']) && is_array($taxonomy['subtags'])) {
        return $taxonomy;
    }
    echo "⚠️ Invalid taxonomy structure returned\n";
    return null;
}
// ============================================================================
// MAIN EXECUTION
// ============================================================================
// Pipeline: load posts -> request tag suggestions per batch -> aggregate tag
// frequencies -> request a hierarchical taxonomy -> write it to a JSON file.
echo "📚 Step 1: Querying posts...\n";
$posts = get_all_posts();
$total_posts = count($posts);
echo " Found $total_posts posts\n\n";
if ($total_posts === 0) {
    echo "❌ No published posts found; nothing to analyze\n";
    exit(1);
}

echo "📊 Step 2: Analyzing posts with OpenAI API...\n";
echo " (This may take 10-15 minutes for $total_posts posts)\n";
$batches = array_chunk($posts, BATCH_SIZE);
$total_batches = count($batches);
$current_batch = 0;
$analyzed_posts = 0; // posts belonging to batches that returned suggestions
$all_suggestions = array();
foreach ($batches as $batch) {
    $current_batch++;
    $batch_to_process = array();
    foreach ($batch as $post) {
        $batch_to_process[] = get_post_metadata($post);
    }
    echo " 🔄 Processing batch $current_batch/$total_batches (" . count($batch_to_process) . " posts)...\n";
    $result = analyze_posts_batch($batch_to_process);
    // The reply is model output: require an actual list before merging it.
    if (is_array($result) && isset($result['posts']) && is_array($result['posts'])) {
        $all_suggestions = array_merge($all_suggestions, $result['posts']);
        $analyzed_posts += count($batch_to_process);
        echo " ✅ Batch complete\n";
    } else {
        echo " ⚠️ Batch failed, skipping...\n";
    }
    // Rate limiting - wait 2 seconds between batches
    if ($current_batch < $total_batches) {
        sleep(2);
    }
}

echo "\n📈 Step 3: Aggregating tags...\n";
$aggregated = aggregate_tags($all_suggestions);
$unique_tags = count($aggregated['frequencies']);
echo " Found $unique_tags unique tags across all posts\n\n";
// Every batch failed or returned nothing usable: a taxonomy request would have
// no data to work from, so stop before spending another API call.
if ($unique_tags === 0) {
    echo "❌ No tags were suggested; cannot generate a taxonomy\n";
    exit(1);
}

echo "🏗️ Step 4: Generating hierarchical taxonomy...\n";
$taxonomy = generate_taxonomy($aggregated);
if ($taxonomy
    && isset($taxonomy['main_tags'], $taxonomy['subtags'])
    && is_array($taxonomy['main_tags'])
    && is_array($taxonomy['subtags'])) {
    $main_tag_count = count($taxonomy['main_tags']);
    $subtag_count = 0;
    foreach ($taxonomy['subtags'] as $parent => $children) {
        // Skip malformed entries; count() on a non-array is a TypeError on PHP 8.
        if (is_array($children)) {
            $subtag_count += count($children);
        }
    }
    echo " ✅ Generated $main_tag_count main tags and $subtag_count subtags\n\n";

    echo "💾 Step 5: Saving output...\n";
    $timestamp = date('Y-m-d-His');
    $output_file = OUTPUT_DIR . "tag-definitions-{$timestamp}.json";
    $taxonomy_json = json_encode($taxonomy, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES);
    // Only report success when the file was actually written.
    if ($taxonomy_json === false || file_put_contents($output_file, $taxonomy_json) === false) {
        echo "❌ Failed to write output file: $output_file\n";
        exit(1);
    }
    echo " ✅ Saved to: $output_file\n\n";

    echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n";
    echo "✨ Tag generation complete!\n\n";
    echo "📋 Summary:\n";
    // Report the posts whose batches succeeded, not merely the posts queried.
    echo " Total posts analyzed: $analyzed_posts of $total_posts\n";
    echo " Unique tags found: $unique_tags\n";
    echo " Main tags: $main_tag_count\n";
    echo " Subtags: $subtag_count\n";
    echo " Output: $output_file\n\n";

    // Show the first 10 main tags in the order the taxonomy lists them
    echo "🏆 Top Main Tags:\n";
    foreach (array_slice(array_keys($taxonomy['main_tags']), 0, 10) as $tag) {
        echo " • $tag\n";
    }
} else {
    echo "❌ Failed to generate taxonomy\n";
    exit(1);
}
echo "\n✅ Done!\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment