Skip to content

Instantly share code, notes, and snippets.

@Jungwoo-An
Created September 26, 2025 08:42
Show Gist options
  • Select an option

  • Save Jungwoo-An/eac5ef0f191fae1d441a8a192b0cda49 to your computer and use it in GitHub Desktop.

Select an option

Save Jungwoo-An/eac5ef0f191fae1d441a8a192b0cda49 to your computer and use it in GitHub Desktop.
test
import dayjs from 'dayjs'
import { createApifyClient } from './apify'
/**
 * Searches Google (via the Apify `google-search-scraper` actor) for pages
 * matching `query`, restricted to instagram.com, and returns the flattened
 * organic search results from the actor's dataset.
 *
 * @param query            free-text search query; `site:instagram.com` is appended
 * @param afterDate        optional lower bound (YYYY-MM-DD) passed to the actor
 * @param beforeDate       optional upper bound (YYYY-MM-DD) passed to the actor
 * @param maxPagesPerQuery max result pages to crawl (default 10)
 * @param resultsPerPage   results requested per page (default 20)
 * @returns array of organic result objects (shape defined by the actor)
 */
export const searchPostsByQuery = async ({
  query,
  afterDate,
  beforeDate,
  maxPagesPerQuery = 10,
  resultsPerPage = 20,
}: {
  query: string
  // Optional: defaults are applied in the destructuring above, so callers
  // may omit these (the previous type wrongly declared them required).
  maxPagesPerQuery?: number
  resultsPerPage?: number
  afterDate?: string
  beforeDate?: string
}) => {
  const client = createApifyClient()
  const { defaultDatasetId } = await client.actor('apify/google-search-scraper').call({
    focusOnPaidAds: false,
    forceExactMatch: false,
    includeIcons: false,
    // Only send the date bounds when provided, so the actor applies no filter otherwise.
    ...(beforeDate && { beforeDate }),
    ...(afterDate && { afterDate }),
    includeUnfilteredResults: false,
    maxPagesPerQuery,
    maximumLeadsEnrichmentRecords: 0,
    mobileResults: false,
    queries: `${query} site:instagram.com`,
    resultsPerPage,
    saveHtml: false,
    saveHtmlToKeyValueStore: true,
    aiMode: 'aiModeOff',
    searchLanguage: '',
    languageCode: '',
    wordsInTitle: [],
    wordsInText: [],
    wordsInUrl: [],
  })
  const { items } = await client.dataset(defaultDatasetId).listItems()
  // Guard: a dataset item can lack organicResults; without `?? []` flatMap
  // would leave a literal `undefined` element in the returned array.
  return items.flatMap((x: any) => x.organicResults ?? [])
}
import { createApifyClient } from './apify'
/**
 * Fetches profile details for the given Instagram usernames via the Apify
 * `instagram-profile-scraper` actor and maps each raw item to a small DTO.
 */
export const getInstagramProfiles = async (usernames: string[]) => {
  const apify = createApifyClient()
  const run = await apify.actor('apify/instagram-profile-scraper').call({ usernames })
  const { items } = await apify.dataset(run.defaultDatasetId).listItems()

  // Pick only the fields the pipeline consumes from the actor's raw output.
  const toProfile = (raw: any) => ({
    username: raw.username,
    fullName: raw.fullName,
    biography: raw.biography,
    followersCount: raw.followersCount,
    profileURL: raw.profilePicUrlHD,
  })
  return items.map(toProfile)
}
/**
 * Fetches post details for the given Instagram post URLs via the Apify
 * `instagram-scraper` actor and maps each raw item to a small DTO.
 */
export const getInstagramPosts = async (URLs: string[]) => {
  const apify = createApifyClient()
  const run = await apify.actor('apify/instagram-scraper').call({
    addParentData: false,
    directUrls: URLs,
    enhanceUserSearchWithFacebookPage: false,
    resultsType: 'details',
    searchType: 'user',
  })
  const { items } = await apify.dataset(run.defaultDatasetId).listItems()

  // Pick only the fields the pipeline consumes from the actor's raw output.
  const toPost = (raw: any) => ({
    url: raw.url,
    caption: raw.caption,
    username: raw.ownerUsername,
    thumbnail: raw.displayUrl,
    timestamp: raw.timestamp,
  })
  return items.map(toPost)
}
/**
 * Fetches the "following" lists for the given Instagram usernames via the
 * community `instagram-following-scraper` actor, with timing logs.
 * Best-effort: any failure is logged and an empty array is returned.
 */
export const getInstagramFollowing = async (usernames: string[]) => {
  try {
    console.log(`[INSTAGRAM] Starting following scraper for ${usernames.length} users: ${usernames.join(', ')}`)
    const apify = createApifyClient()

    console.log(`[INSTAGRAM] Calling louisdeconinck/instagram-following-scraper actor...`)
    const startedAt = Date.now()
    const { defaultDatasetId } = await apify
      .actor('louisdeconinck/instagram-following-scraper')
      .call({ usernames })
    const callDuration = Date.now() - startedAt
    console.log(`[INSTAGRAM] Actor call completed in ${callDuration}ms, datasetId: ${defaultDatasetId}`)

    console.log(`[INSTAGRAM] Fetching results from dataset...`)
    const { items } = await apify.dataset(defaultDatasetId).listItems()
    const totalDuration = Date.now() - startedAt
    console.log(`[INSTAGRAM] Following scraping completed in ${totalDuration}ms, got ${items.length} items`)

    // Map the actor's snake_case fields to the pipeline's camelCase DTO.
    const toFollowing = (raw: any) => ({
      username: raw.username,
      fullName: raw.full_name,
      followedBy: raw.followed_by,
      profilePicUrl: raw.profile_pic_url,
      isVerified: raw.is_verified || false,
      pk: raw.pk,
    })
    return items.map(toFollowing)
  } catch (error) {
    // Deliberate best-effort: log and degrade to an empty list.
    console.error('[INSTAGRAM] Following list scraping failed:', error)
    return []
  }
}
import 'dotenv/config'
import { searchPostsByQuery } from './clients/google-search'
import { getInstagramPosts } from './clients/instagram'
import { processBatch } from './utils/batch'
import { uploadImageToS3 } from './clients/s3'
import { createDbClient } from './clients/db'
import { generateText } from 'ai'
import { openai } from '@ai-sdk/openai'
import dayjs from 'dayjs'
// const client = await createDbClient()
/**
 * Validates a scraped Instagram URL and normalizes it to an https:// form.
 *
 * Accepts bare domains ("instagram.com/p/x"), http://, and https:// inputs;
 * rejects anything whose host is not exactly instagram.com / www.instagram.com,
 * and bare-domain URLs with no path.
 *
 * @param url raw URL string from the search results (may be empty/garbled)
 * @returns the normalized https URL, or null when the input is unusable
 */
function validateAndNormalizeInstagramUrl(url: string): string | null {
  if (!url) return null
  try {
    let normalizedUrl = url.trim()
    // Upgrade plain http, and prepend a scheme when none is present so that
    // `new URL` can parse bare "instagram.com/..." strings.
    if (normalizedUrl.startsWith('http://')) {
      normalizedUrl = normalizedUrl.replace('http://', 'https://')
    }
    if (!normalizedUrl.startsWith('https://')) {
      normalizedUrl = `https://${normalizedUrl}`
    }
    const urlObj = new URL(normalizedUrl)
    // The URL parser lowercases hostnames, so this case-sensitive exact match
    // is safe. (It also subsumes the previous, redundant
    // `hostname.includes('instagram.com')` pre-check, which has been removed.)
    if (!/^(www\.)?instagram\.com$/.test(urlObj.hostname)) {
      return null
    }
    // A bare domain with no path is not a post/profile link.
    if (!urlObj.pathname || urlObj.pathname === '/') {
      return null
    }
    return normalizedUrl
  } catch {
    // Malformed input that `new URL` could not parse.
    return null
  }
}
/**
 * End-to-end pipeline for one search query:
 * 1) Google-search for Instagram post URLs matching `query`,
 * 2) validate/normalize/dedupe the URLs,
 * 3) fetch post details, 4) re-host thumbnails on S3 and extract
 * brand/product metadata from captions with an LLM,
 * 5) bulk-insert the surviving posts into `temp_posts`.
 * All progress logs are in Korean (project convention).
 */
async function scrapePosts(query: string) {
console.log(`[${query}] 포스트 스크래핑 시작`)
const queries = await searchPostsByQuery({
query,
maxPagesPerQuery: 100,
resultsPerPage: 100,
// Hard-coded to calendar year 2023 for this run; the commented lines below
// are the rolling-window alternative that was used previously.
afterDate: '2023-01-01',
beforeDate: '2023-12-31',
// afterDate: dayjs().subtract(1, 'day').format('YYYY-MM-DD'),
// beforeDate: dayjs().subtract(30, 'day').format('YYYY-MM-DD'),
})
console.log(`[${query}] 포스트 스크래핑 완료`, queries.length)
// Keep only post links ("instagram.com/p/..."); note flatMap here acts as a
// plain map since x.url is a single value — presumably each result has a
// string url (a missing url would throw on .includes) — TODO confirm.
const allUrls = queries.flatMap((x: any) => x.url).filter((x: any) => x.includes('instagram.com/p'))
// Normalize and drop invalid URLs; the type predicate narrows to string[].
const validUrls = allUrls.map(validateAndNormalizeInstagramUrl).filter((url): url is string => url !== null)
console.log(`[${query}] URL 필터링 완료: ${allUrls.length} -> ${validUrls.length}`)
// Dedupe before hitting the (paid) Apify actor.
const rawPosts = await getInstagramPosts([...new Set(validUrls)])
console.log(`[${query}] 포스트 디테일 정보 가져옴`, rawPosts.length)
const posts = await processBatch(
// Only posts with a caption, owner and URL are worth classifying.
rawPosts.filter((x: any) => !!(x.caption && x.username && x.url)),
async x => {
// Thumbnail re-hosting and LLM classification are independent — run both
// concurrently. S3 failure falls back to the original thumbnail URL.
const [thumbnail, rawCategory] = await Promise.all([
uploadImageToS3(x.thumbnail).catch(() => x.thumbnail),
generateText({
model: openai('gpt-5'),
// Korean system prompt: classify group-buy posts, extracting brand,
// product name and category (null when not found) as JSON.
system: `
<goal>
당신은 공동구매 게시물 분류 전문가입니다. 캡션 정보를 확인하여, 지침에 따라 분류하세요. 출력 형식에 맞추세요.
</goal>
<instruction>
- 브랜드명 (못 찾으면 null)
- 상품명 (못 찾으면 null)
- 카테고리 (못 찾으면 null)
</instruction>
<output>
JSON 형식으로 출력하세요.
{
"brand": "브랜드명",
"productName": "상품명",
"category": "카테고리"
}
</output>
`,
prompt: `
${x.caption}
`,
}),
])
let metadata = {}
try {
// Clean up the JSON string: strip control characters the model may emit
// before parsing.
const cleanedText = rawCategory.text.trim().replace(/[\u0000-\u001F\u007F-\u009F]/g, '')
metadata = JSON.parse(cleanedText)
// Fall back to an empty object when the parsed value is not an object.
if (typeof metadata !== 'object' || metadata === null) {
metadata = {}
}
} catch (e) {
// Unparseable model output: log it and continue with empty metadata.
console.error(`[${query}] JSON 파싱 실패:`, rawCategory.text)
metadata = {}
}
return {
...x,
thumbnail,
metadata,
}
},
{ batchSize: 100, concurrency: 5 },
// Discard posts where the LLM could not identify both brand and product.
).then(posts => posts.filter((x: any) => !!(x.metadata.brand && x.metadata.productName)))
console.log(`[${query}] 포스트 공구 정보 추출 완료`, posts.length)
const client = await createDbClient()
// Row order must match the column list in the INSERT below (9 columns).
const values = posts.map(x => [
x.username,
x.url,
x.caption,
(x.metadata as any)?.brand || null,
query,
(x.metadata as any)?.productName || null,
JSON.stringify(x.metadata),
x.timestamp,
x.thumbnail,
])
if (values.length > 0) {
// Build one parameterized multi-row VALUES clause: row i uses
// placeholders $i*9+1 .. $i*9+9, matching values.flat() below.
const placeholders = values
.map(
(_, i) =>
`($${i * 9 + 1}, $${i * 9 + 2}, $${i * 9 + 3}, $${i * 9 + 4}, $${i * 9 + 5}, $${i * 9 + 6}, $${i * 9 + 7}, $${
i * 9 + 8
}, $${i * 9 + 9})`,
)
.join(', ')
try {
await client.query(
`INSERT INTO temp_posts (username, post_url, content, brand, keyword, product_name, metadata, posted_at, thumbnail) VALUES ${placeholders}`,
values.flat(),
)
console.log(`[${query}] 포스트 데이터베이스 삽입 완료`, posts.length)
} catch (e) {
// Insert failure is logged but does not abort the pipeline for this query.
console.error(`[${query}] 포스트 데이터베이스 삽입 실패`, e)
}
}
console.log(`[${query}] 포스트 파이프라인 완료`)
}
// Entry point: run the pipeline for every configured search query in parallel.
// (Plain top-level await instead of the original `await ….then(…)`, which
// mixed the two promise styles on one chain.)
await Promise.all(['아빠차트 공구'].map(x => scrapePosts(x)))
console.log('전체 파이프라인 완료')
import { S3Client, PutObjectCommand } from '@aws-sdk/client-s3'
import { v4 as uuidv4 } from 'uuid'
import axios from 'axios'
// Shared S3 client; region comes from the environment, defaulting to Seoul.
const s3Client = new S3Client({
region: process.env.AWS_REGION || 'ap-northeast-2',
})
// Destination bucket (also the public hostname of uploaded images) and the
// key prefix for this pipeline's temporary uploads.
const BUCKET_NAME = 'img.zvzo.shop'
const TEMP_CREATOR_FOLDER = 'temp-match-growth'
/**
 * Downloads the image at `url` and re-hosts it under the temp folder of the
 * public S3 bucket, keyed by a fresh UUID.
 *
 * @param url         source image URL to download
 * @param contentType MIME type stored on the object (default "image/jpeg");
 *                    its subtype also becomes the file extension
 * @returns public https URL of the uploaded object
 */
export async function uploadImageToS3(url: string, contentType: string = 'image/jpeg'): Promise<string> {
  const download = await axios.get(url, { responseType: 'arraybuffer' })
  const imageBytes = Buffer.from(download.data)

  // Derive the extension from the MIME subtype, e.g. "image/png" -> "png".
  const extension = contentType.split('/')[1] || 'jpg'
  const objectKey = `${TEMP_CREATOR_FOLDER}/${uuidv4()}.${extension}`

  await s3Client.send(
    new PutObjectCommand({
      Bucket: BUCKET_NAME,
      Key: objectKey,
      Body: imageBytes,
      ContentType: contentType,
    }),
  )
  return `https://${BUCKET_NAME}/${objectKey}`
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment