Skip to content

Instantly share code, notes, and snippets.

@db-pj
Created December 2, 2025 15:23
Show Gist options
  • Select an option

  • Save db-pj/a02c22c42b0f1f7ed90f6bc603b91096 to your computer and use it in GitHub Desktop.

Select an option

Save db-pj/a02c22c42b0f1f7ed90f6bc603b91096 to your computer and use it in GitHub Desktop.
<?php
/**
* Tag Assignment Script for BadCredit.org
*
* Analyzes all published posts and assigns one main tag and optionally one subtag
* based on the tag definitions generated by generate-tags.php
*
* Output: CSV file with post assignments in volumes/phpfpm/scripts/tags/output/
*/
// WordPress Bootstrap
// PUBLIC_HTML must point at the directory that contains wp-load.php; loading it
// makes the WordPress API (WP_Query, get_permalink, ...) available below.
$public_html = getenv('PUBLIC_HTML');
if (!$public_html || !file_exists($public_html . '/wp-load.php')) {
    echo "Error: PUBLIC_HTML environment variable not set or wp-load.php not found\n";
    exit(1);
}
require_once($public_html . '/wp-load.php');

// Configuration
// The API key is taken from the OPENAI_API_KEY environment variable so that it
// does not have to be committed to the script. define() is used instead of
// const because the value is only known at runtime.
define('OPENAI_API_KEY', getenv('OPENAI_API_KEY') ?: '');
const OPENAI_MODEL = 'gpt-4o';
const BATCH_SIZE = 20; // Process posts in batches
const DEV_MODE = true; // If true, only process 100 posts for testing
const OUTPUT_DIR = __DIR__ . '/output/';
const TAG_DEFINITIONS_FILE = __DIR__ . '/output/tag-definitions.json';

// Without a key every API request would be rejected, so stop before doing any work.
if (OPENAI_API_KEY === '') {
    echo "Error: OPENAI_API_KEY environment variable not set\n";
    exit(1);
}

// Ensure output directory exists. The trailing is_dir() covers the case where
// another process created the directory between the check and the mkdir().
if (!is_dir(OUTPUT_DIR) && !mkdir(OUTPUT_DIR, 0755, true) && !is_dir(OUTPUT_DIR)) {
    echo "Error: Unable to create output directory " . OUTPUT_DIR . "\n";
    exit(1);
}

echo "Starting Tag Assignment Process\n";
if (DEV_MODE) {
    echo "*** DEV MODE: Only processing 100 posts ***\n";
}
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n";
/**
 * Load the tag definitions produced by generate-tags.php.
 *
 * Reads TAG_DEFINITIONS_FILE, decodes it as JSON and verifies that it contains
 * the two sections the rest of the script iterates over. Every failure is
 * fatal: a message is printed and the script exits with status 1.
 *
 * @return array Decoded definitions with array-valued 'main_tags' and 'subtags' keys.
 */
function load_tag_definitions() {
    if (!file_exists(TAG_DEFINITIONS_FILE)) {
        echo "Error: Tag definitions file not found at " . TAG_DEFINITIONS_FILE . "\n";
        echo " Please run generate-tags.php first to create the tag definitions.\n";
        exit(1);
    }

    $json = file_get_contents(TAG_DEFINITIONS_FILE);
    if ($json === false) {
        echo "Error: Unable to read tag definitions file at " . TAG_DEFINITIONS_FILE . "\n";
        exit(1);
    }

    $definitions = json_decode($json, true);

    // Both sections are later walked with foreach/count(), so they must be arrays.
    $is_valid = is_array($definitions)
        && isset($definitions['main_tags'], $definitions['subtags'])
        && is_array($definitions['main_tags'])
        && is_array($definitions['subtags']);

    if (!$is_valid) {
        echo "Error: Invalid tag definitions format\n";
        if ($definitions === null && json_last_error() !== JSON_ERROR_NONE) {
            echo " JSON decode error: " . json_last_error_msg() . "\n";
        }
        exit(1);
    }

    return $definitions;
}
/**
 * Query published posts, newest first.
 *
 * When DEV_MODE is on, only the newest $dev_limit posts are requested;
 * otherwise every published post is returned.
 *
 * @param int $dev_limit Maximum number of posts to fetch while DEV_MODE is on.
 * @return array Posts returned by WP_Query.
 */
function get_all_posts($dev_limit = 100) {
    $args = array(
        'post_type' => 'post',
        'post_status' => 'publish',
        'posts_per_page' => DEV_MODE ? (int) $dev_limit : -1,
        'orderby' => 'date',
        'order' => 'DESC',
        // Keep sticky posts in their normal date position; WP_Query can otherwise
        // move them to the front and add ones beyond the requested page size.
        'ignore_sticky_posts' => true,
        // The total row count is never read here, so skip computing it.
        'no_found_rows' => true,
    );

    $query = new WP_Query($args);

    return $query->posts;
}
/**
 * Reduce a post body to a short plain-text excerpt for the API prompt.
 *
 * Shortcodes and HTML are removed, runs of whitespace are collapsed to a
 * single space, and the result is cut to the first 3000 characters and trimmed.
 *
 * @param object $post Post object; only its post_content property is read.
 * @return string Plain-text excerpt of at most 3000 characters.
 */
function extract_clean_content($post) {
    $content = $post->post_content;

    // Remove shortcodes
    $content = strip_shortcodes($content);

    // Remove HTML tags
    $content = wp_strip_all_tags($content);

    // Remove extra whitespace
    $content = preg_replace('/\s+/', ' ', $content);

    // Limit to first 3000 characters for API efficiency. mb_substr() counts
    // characters rather than bytes, so a multibyte character is never split;
    // a split character would be invalid UTF-8 and make json_encode() fail
    // when the excerpt is embedded in the request.
    $content = mb_substr($content, 0, 3000, 'UTF-8');

    return trim($content);
}
/**
 * Collect the fields of a post that the assignment step works with.
 *
 * @param object $post Post object; ID and post_title are read directly and the
 *                     post is also passed to extract_clean_content().
 * @return array Associative array with the keys post_id, title, url, content
 *               and categories (category names), in that order.
 */
function get_post_data($post) {
    $id = $post->ID;
    $category_names = wp_get_post_categories($id, array('fields' => 'names'));

    // Built key by key so the resulting key order stays post_id, title, url,
    // content, categories.
    $record = array('post_id' => $id);
    $record['title'] = $post->post_title;
    $record['url'] = get_permalink($id);
    $record['content'] = extract_clean_content($post);
    $record['categories'] = $category_names;

    return $record;
}
/**
 * Assemble the user prompt sent to the model for one batch of articles.
 *
 * The prompt lists every main tag and every subtag (grouped under its parent)
 * with its definition, states the assignment rules, embeds the articles as
 * pretty-printed JSON and describes the JSON shape expected in the reply.
 *
 * @param array $posts_data      Article records to categorize; encoded as JSON as-is.
 * @param array $tag_definitions Definitions with 'main_tags' (name => data) and
 *                               'subtags' (parent => name => data); each data
 *                               entry is read for its 'definition' key.
 * @return string The complete prompt text.
 */
function build_prompt($posts_data, $tag_definitions) {
    // One markdown bullet per main tag.
    $main_lines = array();
    foreach ($tag_definitions['main_tags'] as $name => $info) {
        $main_lines[] = "- **$name**: {$info['definition']}\n";
    }
    $main_tags_list = implode('', $main_lines);

    // One headed section per parent tag, with a bullet per child subtag.
    $subtag_sections = array();
    foreach ($tag_definitions['subtags'] as $parent_name => $children) {
        $section = "\n**$parent_name** subtags:\n";
        foreach ($children as $child_name => $info) {
            $section .= " - **$child_name**: {$info['definition']}\n";
        }
        $subtag_sections[] = $section;
    }
    $subtags_list = implode('', $subtag_sections);

    $articles_json = json_encode($posts_data, JSON_PRETTY_PRINT);

    return "You are a content categorization expert for BadCredit.org, a financial advice website focused on helping people with bad credit.
Your task is to assign ONE main tag and OPTIONALLY one subtag to each article based on its content.
## Available Tags:
### Main Tags:
$main_tags_list
### Subtags (organized by parent):
$subtags_list
## Assignment Rules:
1. **Prioritize the title heavily (60% weight)** - The article title is the most important indicator of its primary topic
2. Consider content (40% weight) - The article body provides supporting context
3. **Assign exactly ONE main tag** - Choose the single most relevant tag
4. **Optionally assign ONE subtag** - Only if the article clearly focuses on that specific subtopic
5. **Subtag must match parent** - If you assign a subtag, it must be a child of the assigned main tag
6. If unsure between tags, choose the more specific one
7. If an article doesn't clearly fit any subtag, leave subtag empty
## Articles to Categorize:
$articles_json
## Response Format:
Respond with ONLY a valid JSON object in this exact format:
{
\"assignments\": [
{
\"post_id\": 123,
\"main_tag\": \"Tag Name\",
\"subtag\": \"Subtag Name\"
},
{
\"post_id\": 124,
\"main_tag\": \"Tag Name\",
\"subtag\": \"\"
}
]
}
IMPORTANT:
- Use exact tag names from the definitions above
- Every article must have a main_tag
- subtag can be empty string \"\" if not applicable
- Respond ONLY with valid JSON, no other text";
}
/**
 * Ask the OpenAI chat completions API to assign tags to one batch of articles.
 *
 * Builds the prompt with build_prompt(), posts it to the API and decodes the
 * model's reply. Every failure (encoding, transport, non-200 status,
 * unparseable reply) is reported on stdout and signalled by returning null,
 * so the caller can skip the batch and continue.
 *
 * @param array $posts_data      Article records for this batch.
 * @param array $tag_definitions Tag definitions passed through to build_prompt().
 * @return array|null The decoded 'assignments' list, or null on any failure.
 */
function assign_tags_batch($posts_data, $tag_definitions) {
    $prompt = build_prompt($posts_data, $tag_definitions);

    $payload = json_encode(array(
        'model' => OPENAI_MODEL,
        'messages' => array(
            array('role' => 'system', 'content' => 'You are a content categorization expert. Respond only with valid JSON.'),
            array('role' => 'user', 'content' => $prompt)
        ),
        'temperature' => 0.2,
        'max_tokens' => 2000,
    ));
    if ($payload === false) {
        // Typically caused by invalid UTF-8 somewhere in the article text.
        echo "API Error: could not encode request (" . json_last_error_msg() . ")\n";
        return null;
    }

    $ch = curl_init('https://api.openai.com/v1/chat/completions');
    curl_setopt_array($ch, array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => $payload,
        CURLOPT_HTTPHEADER => array(
            'Content-Type: application/json',
            'Authorization: Bearer ' . OPENAI_API_KEY
        ),
        // Bound the request so a stalled connection cannot hang the whole run.
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT => 120,
    ));

    $response = curl_exec($ch);
    // Read the error text before the handle is closed.
    $transport_error = ($response === false) ? curl_error($ch) : '';
    $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    if ($response === false) {
        // No HTTP exchange took place (DNS, TLS, timeout, ...).
        echo "API Error (request failed): $transport_error\n";
        return null;
    }
    if ($http_code !== 200) {
        echo "API Error (HTTP $http_code): $response\n";
        return null;
    }

    $result = json_decode($response, true);
    if (!isset($result['choices'][0]['message']['content']) || !is_string($result['choices'][0]['message']['content'])) {
        echo "API Error: response did not contain a message\n";
        return null;
    }

    // Remove a surrounding markdown code fence (with or without a "json" label) if present
    $content = trim($result['choices'][0]['message']['content']);
    $content = preg_replace('/^```(?:json)?\s*/i', '', $content);
    $content = preg_replace('/\s*```$/', '', $content);

    $decoded = json_decode($content, true);
    if (!is_array($decoded) || !isset($decoded['assignments']) || !is_array($decoded['assignments'])) {
        echo "API Error: could not parse assignments from model response\n";
        return null;
    }

    return $decoded['assignments'];
}
/**
 * Write the per-post tag assignments to a CSV file.
 *
 * The file is created or truncated and receives a header row followed by one
 * row per assignment with the columns post_id, url, title, main_tag, subtag.
 * Missing main_tag/subtag values are written as empty fields.
 *
 * @param array  $all_assignments List of assignment records (associative arrays).
 * @param string $output_file     Path of the CSV file to write.
 * @return void
 * @throws RuntimeException If the output file cannot be opened for writing.
 */
function write_csv($all_assignments, $output_file) {
    $fp = fopen($output_file, 'w');
    if ($fp === false) {
        // Fail with a clear message instead of passing false on to fputcsv().
        throw new RuntimeException("Unable to open $output_file for writing");
    }

    try {
        // Write header
        fputcsv($fp, array('post_id', 'url', 'title', 'main_tag', 'subtag'));

        // Write data
        foreach ($all_assignments as $assignment) {
            fputcsv($fp, array(
                $assignment['post_id'],
                $assignment['url'],
                $assignment['title'],
                $assignment['main_tag'] ?? '',
                $assignment['subtag'] ?? ''
            ));
        }
    } finally {
        // Close the handle even if a row could not be written.
        fclose($fp);
    }
}
/**
 * Write a CSV summary of how many articles received each tag.
 *
 * Main tags are listed by article count, highest first. Each main tag row
 * (empty Subtag column) is followed by rows for its subtags, also ordered by
 * count descending. Subtags are counted per main tag.
 *
 * @param array  $all_assignments List of assignment records with 'main_tag'
 *                                and optionally 'subtag'.
 * @param string $output_file     Path of the CSV file to write.
 * @return void
 * @throws RuntimeException If the output file cannot be opened for writing.
 */
function generate_tag_counts($all_assignments, $output_file) {
    $main_counts = array();
    $subtag_counts = array();

    // Count main tags and subtags separately
    foreach ($all_assignments as $assignment) {
        $main = $assignment['main_tag'] ?? '';
        $sub = (string) ($assignment['subtag'] ?? '');

        $main_counts[$main] = ($main_counts[$main] ?? 0) + 1;

        // Compare against '' rather than using empty(), which would also
        // discard a subtag literally named "0".
        if ($sub !== '') {
            $subtag_counts[$main][$sub] = ($subtag_counts[$main][$sub] ?? 0) + 1;
        }
    }

    // Sort main tags by count descending
    arsort($main_counts);

    $fp = fopen($output_file, 'w');
    if ($fp === false) {
        // Fail with a clear message instead of passing false on to fputcsv().
        throw new RuntimeException("Unable to open $output_file for writing");
    }

    try {
        fputcsv($fp, array('Main Tag', 'Subtag', 'Article Count'));

        // Write data: parent tag followed by its subtags
        foreach ($main_counts as $main_tag => $main_count) {
            // Parent tag row (with empty subtag)
            fputcsv($fp, array($main_tag, '', $main_count));

            if (!isset($subtag_counts[$main_tag])) {
                continue;
            }

            // Sort subtags by count descending
            arsort($subtag_counts[$main_tag]);
            foreach ($subtag_counts[$main_tag] as $subtag => $subtag_count) {
                fputcsv($fp, array($main_tag, $subtag, $subtag_count));
            }
        }
    } finally {
        // Close the handle even if a row could not be written.
        fclose($fp);
    }
}
// ============================================================================
// MAIN EXECUTION
// ============================================================================
// Flow: load tag definitions -> query posts -> send posts to the API in batches
// of BATCH_SIZE -> write the assignments CSV and the tag-count CSV -> print a
// short summary.
echo "Step 1: Loading tag definitions...\n";
$tag_definitions = load_tag_definitions();
$main_tag_count = count($tag_definitions['main_tags']);
$subtag_count = 0;
foreach ($tag_definitions['subtags'] as $children) {
    $subtag_count += count($children);
}
echo " Loaded $main_tag_count main tags and $subtag_count subtags\n\n";

echo "Step 2: Querying posts...\n";
$posts = get_all_posts();
$total_posts = count($posts);
if (DEV_MODE) {
    echo " Found $total_posts posts (DEV MODE - limited to 100)\n\n";
} else {
    echo " Found $total_posts posts\n\n";
}

echo "Step 3: Assigning tags with OpenAI API...\n";
echo " (This may take 5-10 minutes for $total_posts posts)\n";
$batches = array_chunk($posts, BATCH_SIZE);
$total_batches = count($batches);
$current_batch = 0;
$all_assignments = array();
// Post IDs that already have an accepted assignment, so that a post the model
// returns twice is only counted once.
$assigned_ids = array();

foreach ($batches as $batch) {
    $current_batch++;

    // Gather each post's data once: the full record is kept for the CSV, and a
    // copy without the URL is sent to the API (to save tokens).
    $batch_lookup = array();
    $batch_for_api = array();
    foreach ($batch as $post) {
        $post_data = get_post_data($post);
        $batch_lookup[$post->ID] = $post_data;
        unset($post_data['url']);
        $batch_for_api[] = $post_data;
    }

    echo "Processing batch $current_batch/$total_batches (" . count($batch_for_api) . " posts)...\n";
    $assignments = assign_tags_batch($batch_for_api, $tag_definitions);

    if (is_array($assignments) && $assignments) {
        $accepted = 0;
        // Merge with full post data, keeping only well-formed entries that
        // refer to a post that was actually part of this batch.
        foreach ($assignments as $assignment) {
            if (!is_array($assignment) || !isset($assignment['post_id'], $assignment['main_tag'])) {
                continue;
            }
            $post_id = $assignment['post_id'];
            $main_tag = $assignment['main_tag'];
            if ((!is_int($post_id) && !is_string($post_id)) || !is_string($main_tag) || $main_tag === '') {
                continue;
            }
            if (!isset($batch_lookup[$post_id]) || isset($assigned_ids[$post_id])) {
                continue;
            }

            $assigned_ids[$post_id] = true;
            $accepted++;
            $all_assignments[] = array(
                'post_id' => $post_id,
                'url' => $batch_lookup[$post_id]['url'],
                'title' => $batch_lookup[$post_id]['title'],
                'main_tag' => $main_tag,
                'subtag' => (isset($assignment['subtag']) && is_string($assignment['subtag'])) ? $assignment['subtag'] : '',
            );
        }
        echo " Batch complete\n";
        if ($accepted < count($batch_for_api)) {
            echo " Warning: only $accepted of " . count($batch_for_api) . " posts received a usable assignment\n";
        }
    } else {
        echo " Batch failed, skipping...\n";
    }

    // Rate limiting - wait 2 seconds between batches
    if ($current_batch < $total_batches) {
        sleep(2);
    }
}

echo "\n Step 4: Saving results...\n";
$timestamp = date('Y-m-d-His');
$assignments_file = OUTPUT_DIR . "article-tag-assignments-{$timestamp}.csv";
$counts_file = OUTPUT_DIR . "tag-counts-{$timestamp}.csv";
write_csv($all_assignments, $assignments_file);
echo " Assignments saved to: $assignments_file\n";
generate_tag_counts($all_assignments, $counts_file);
echo " Tag counts saved to: $counts_file\n\n";

echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n";
echo "Tag assignment complete!\n\n";
echo " Summary:\n";
echo " Total posts processed: " . count($all_assignments) . " / $total_posts\n";
echo " Assignments file: $assignments_file\n";
echo " Tag counts file: $counts_file\n\n";

// Show top 10 assigned tags
echo " Top 10 Most Assigned Tags:\n";
$counts = array_count_values(array_column($all_assignments, 'main_tag'));
arsort($counts);
// preserve_keys keeps tag names intact even if a tag name is numeric.
foreach (array_slice($counts, 0, 10, true) as $tag => $num) {
    echo " • $tag: $num articles\n";
}
echo "\nDone!\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment