Created
September 26, 2025 08:42
-
-
Save Jungwoo-An/eac5ef0f191fae1d441a8a192b0cda49 to your computer and use it in GitHub Desktop.
test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import dayjs from 'dayjs' | |
| import { createApifyClient } from './apify' | |
/**
 * Runs the `apify/google-search-scraper` actor for a query restricted to
 * instagram.com and returns the organic results of every dataset item,
 * flattened into a single array.
 *
 * @param query - Search terms; ` site:instagram.com` is appended before the actor is called.
 * @param afterDate - Forwarded as the actor's `afterDate` input only when truthy.
 * @param beforeDate - Forwarded as the actor's `beforeDate` input only when truthy.
 * @param maxPagesPerQuery - Forwarded to the actor; defaults to 10.
 * @param resultsPerPage - Forwarded to the actor; defaults to 20.
 * @returns The concatenated `organicResults` arrays of all dataset items.
 *          Items whose `organicResults` is not an array contribute nothing.
 */
export const searchPostsByQuery = async ({
  query,
  afterDate,
  beforeDate,
  maxPagesPerQuery = 10,
  resultsPerPage = 20,
}: {
  query: string
  // Optional in the type so the defaults above are actually reachable by callers.
  maxPagesPerQuery?: number
  resultsPerPage?: number
  afterDate?: string
  beforeDate?: string
}) => {
  const client = createApifyClient()

  const { defaultDatasetId } = await client.actor('apify/google-search-scraper').call({
    focusOnPaidAds: false,
    forceExactMatch: false,
    includeIcons: false,
    // Date bounds are only included in the input when the caller supplied them.
    ...(beforeDate ? { beforeDate } : {}),
    ...(afterDate ? { afterDate } : {}),
    includeUnfilteredResults: false,
    maxPagesPerQuery,
    maximumLeadsEnrichmentRecords: 0,
    mobileResults: false,
    queries: `${query} site:instagram.com`,
    resultsPerPage,
    saveHtml: false,
    saveHtmlToKeyValueStore: true,
    aiMode: 'aiModeOff',
    searchLanguage: '',
    languageCode: '',
    wordsInTitle: [],
    wordsInText: [],
    wordsInUrl: [],
  })

  const { items } = await client.dataset(defaultDatasetId).listItems()

  // An item without an `organicResults` array would otherwise leak `undefined`
  // into the flattened output and break callers that read `.url` on each entry.
  return items.flatMap(item => (Array.isArray(item.organicResults) ? item.organicResults : []))
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import { createApifyClient } from './apify' | |
/**
 * Runs the `apify/instagram-profile-scraper` actor for the given usernames
 * and maps each dataset item to a compact profile summary.
 *
 * @param usernames - Passed to the actor as its `usernames` input.
 * @returns One object per dataset item with `username`, `fullName`,
 *          `biography`, `followersCount` and `profileURL` (taken from the
 *          item's `profilePicUrlHD` field).
 */
export const getInstagramProfiles = async (usernames: string[]) => {
  const apify = createApifyClient()
  const run = await apify.actor('apify/instagram-profile-scraper').call({ usernames })
  const { items: rawProfiles } = await apify.dataset(run.defaultDatasetId).listItems()

  return rawProfiles.map((profile: any) => {
    const { username, fullName, biography, followersCount, profilePicUrlHD } = profile
    return { username, fullName, biography, followersCount, profileURL: profilePicUrlHD }
  })
}
/**
 * Runs the `apify/instagram-scraper` actor in `details` mode for the given
 * URLs and maps each dataset item to the fields the pipeline uses.
 *
 * @param URLs - Passed to the actor as its `directUrls` input.
 * @returns One object per dataset item with `url`, `caption`, `username`
 *          (from `ownerUsername`), `thumbnail` (from `displayUrl`) and `timestamp`.
 */
export const getInstagramPosts = async (URLs: string[]) => {
  const apify = createApifyClient()

  const actorInput = {
    addParentData: false,
    directUrls: URLs,
    enhanceUserSearchWithFacebookPage: false,
    resultsType: 'details',
    searchType: 'user',
  }
  const run = await apify.actor('apify/instagram-scraper').call(actorInput)
  const { items: rawPosts } = await apify.dataset(run.defaultDatasetId).listItems()

  return rawPosts.map((post: any) => {
    const { url, caption, ownerUsername, displayUrl, timestamp } = post
    return { url, caption, username: ownerUsername, thumbnail: displayUrl, timestamp }
  })
}
/**
 * Runs the `louisdeconinck/instagram-following-scraper` actor for the given
 * usernames, logging progress and timings along the way.
 *
 * Best-effort: any error raised while calling the actor or reading the
 * dataset is logged and an empty array is returned instead of rethrowing.
 *
 * @param usernames - Passed to the actor as its `usernames` input.
 * @returns One object per dataset item with `username`, `fullName`,
 *          `followedBy`, `profilePicUrl`, `isVerified` (falsy values become
 *          `false`) and `pk`; `[]` on failure.
 */
export const getInstagramFollowing = async (usernames: string[]) => {
  try {
    console.log(`[INSTAGRAM] Starting following scraper for ${usernames.length} users: ${usernames.join(', ')}`)
    const apify = createApifyClient()

    console.log('[INSTAGRAM] Calling louisdeconinck/instagram-following-scraper actor...')
    const startedAt = Date.now()
    const { defaultDatasetId: datasetId } = await apify
      .actor('louisdeconinck/instagram-following-scraper')
      .call({ usernames })
    const actorElapsedMs = Date.now() - startedAt
    console.log(`[INSTAGRAM] Actor call completed in ${actorElapsedMs}ms, datasetId: ${datasetId}`)

    console.log('[INSTAGRAM] Fetching results from dataset...')
    const { items: followed } = await apify.dataset(datasetId).listItems()
    // Measured from the same start point, so this includes the actor run itself.
    const overallElapsedMs = Date.now() - startedAt
    console.log(`[INSTAGRAM] Following scraping completed in ${overallElapsedMs}ms, got ${followed.length} items`)

    return followed.map((account: any) => {
      const { username, full_name, followed_by, profile_pic_url, is_verified, pk } = account
      return {
        username,
        fullName: full_name,
        followedBy: followed_by,
        profilePicUrl: profile_pic_url,
        isVerified: is_verified || false,
        pk,
      }
    })
  } catch (error) {
    console.error('[INSTAGRAM] Following list scraping failed:', error)
    return []
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import 'dotenv/config' | |
| import { searchPostsByQuery } from './clients/google-search' | |
| import { getInstagramPosts } from './clients/instagram' | |
| import { processBatch } from './utils/batch' | |
| import { uploadImageToS3 } from './clients/s3' | |
| import { createDbClient } from './clients/db' | |
| import { generateText } from 'ai' | |
| import { openai } from '@ai-sdk/openai' | |
| import dayjs from 'dayjs' | |
| // const client = await createDbClient() | |
/**
 * Validates that `url` points at a path on instagram.com and returns it with
 * an `https://` scheme.
 *
 * Normalization steps:
 *  - surrounding whitespace is trimmed;
 *  - a leading `http://` or `https://` (matched case-insensitively) is
 *    rewritten to lowercase `https://`;
 *  - input with no such prefix gets `https://` prepended.
 *
 * @param url - Candidate URL, with or without a scheme.
 * @returns The normalized URL string, or `null` when the input is empty,
 *          cannot be parsed, has a hostname other than `instagram.com` /
 *          `www.instagram.com`, or has no path beyond `/`.
 */
function validateAndNormalizeInstagramUrl(url: string): string | null {
  if (!url) return null

  const trimmed = url.trim()
  const schemePattern = /^https?:\/\//i

  // Case-insensitive scheme detection: without it `HTTPS://…` would get a
  // second `https://` prefix and be rejected as having the hostname "https".
  const normalizedUrl = schemePattern.test(trimmed)
    ? trimmed.replace(schemePattern, 'https://')
    : `https://${trimmed}`

  let parsed: URL
  try {
    parsed = new URL(normalizedUrl)
  } catch {
    return null
  }

  // Anchored match so look-alikes such as `instagram.com.evil.io` or
  // `evil-instagram.com` are rejected.
  if (!/^(www\.)?instagram\.com$/.test(parsed.hostname)) {
    return null
  }

  // A bare domain is not a post or profile link.
  if (!parsed.pathname || parsed.pathname === '/') {
    return null
  }

  return normalizedUrl
}
/** System prompt sent with every caption; asks the model for brand / product / category as JSON. */
const POST_CLASSIFICATION_SYSTEM_PROMPT = `
<goal>
당신은 공동구매 게시물 분류 전문가입니다. 캡션 정보를 확인하여, 지침에 따라 분류하세요. 출력 형식에 맞추세요.
</goal>
<instruction>
- 브랜드명 (못 찾으면 null)
- 상품명 (못 찾으면 null)
- 카테고리 (못 찾으면 null)
</instruction>
<output>
JSON 형식으로 출력하세요.
{
"brand": "브랜드명",
"productName": "상품명",
"category": "카테고리"
}
</output>
`

/** Number of columns written per row into `temp_posts`. */
const TEMP_POSTS_COLUMN_COUNT = 9

/**
 * Upper bound on rows per INSERT statement. With 9 bind parameters per row
 * this keeps each statement far below PostgreSQL's 65535-parameter limit.
 * NOTE(review): assumes createDbClient returns a PostgreSQL client, as the
 * `$n` placeholders suggest — confirm.
 */
const MAX_ROWS_PER_INSERT = 1000

/**
 * Parses the model's reply into a plain object.
 * Strips an optional Markdown code fence and control characters first.
 * Returns `{}` for valid JSON that is not a plain object; throws on invalid JSON.
 */
function parsePostMetadata(rawText: string): Record<string, unknown> {
  const unfenced = rawText
    .trim()
    .replace(/^```(?:json)?/i, '')
    .replace(/```$/, '')
  // Clean up the JSON string: drop control characters (including newlines).
  const cleaned = unfenced.replace(/[\u0000-\u001F\u007F-\u009F]/g, '')
  const parsed: unknown = JSON.parse(cleaned)
  // Fall back to an empty object when the parsed value is not a plain object.
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    return {}
  }
  return parsed as Record<string, unknown>
}

/**
 * Asks the model to classify one caption and returns the parsed metadata.
 * Best-effort: a failed model call or unparsable reply is logged and yields `{}`.
 */
async function classifyPostCaption(query: string, caption: string): Promise<Record<string, unknown>> {
  let rawText: string
  try {
    const result = await generateText({
      model: openai('gpt-5'),
      system: POST_CLASSIFICATION_SYSTEM_PROMPT,
      prompt: `\n${caption}\n`,
    })
    rawText = result.text
  } catch (e) {
    // One failed model call must not discard every other post in the batch.
    console.error(`[${query}] 캡션 분류 실패:`, e)
    return {}
  }

  try {
    return parsePostMetadata(rawText)
  } catch {
    console.error(`[${query}] JSON 파싱 실패:`, rawText)
    return {}
  }
}

/** Builds `($1, $2, …), ($n+1, …)` placeholder groups for a multi-row INSERT. */
function buildInsertPlaceholders(rowCount: number, columnCount: number): string {
  return Array.from({ length: rowCount }, (_row, rowIndex) => {
    const group = Array.from(
      { length: columnCount },
      (_col, columnIndex) => `$${rowIndex * columnCount + columnIndex + 1}`,
    )
    return `(${group.join(', ')})`
  }).join(', ')
}

/**
 * Full pipeline for one search keyword:
 *  1. search Google (via `searchPostsByQuery`) for Instagram results;
 *  2. keep result URLs containing `instagram.com/p`, validate/normalize and de-duplicate them;
 *  3. fetch post details with `getInstagramPosts`;
 *  4. for each post that has a caption, username and url: upload the thumbnail
 *     to S3 (falling back to the original thumbnail URL on failure) and
 *     classify the caption with the model;
 *  5. insert posts that have both a brand and a product name into `temp_posts`.
 *
 * Insert failures are logged, not rethrown. Errors from the search, the post
 * scraper, `processBatch` or `createDbClient` propagate to the caller.
 *
 * @param query - Search keyword; also stored in the `keyword` column and used as the log prefix.
 */
async function scrapePosts(query: string) {
  console.log(`[${query}] 포스트 스크래핑 시작`)
  const searchResults = await searchPostsByQuery({
    query,
    maxPagesPerQuery: 100,
    resultsPerPage: 100,
    afterDate: '2023-01-01',
    beforeDate: '2023-12-31',
    // afterDate: dayjs().subtract(1, 'day').format('YYYY-MM-DD'),
    // beforeDate: dayjs().subtract(30, 'day').format('YYYY-MM-DD'),
  })
  console.log(`[${query}] 포스트 스크래핑 완료`, searchResults.length)

  // Results without a string `url` are skipped instead of throwing in `.includes`.
  const allUrls = searchResults
    .flatMap((result: any) => result?.url ?? [])
    .filter((url: unknown): url is string => typeof url === 'string' && url.includes('instagram.com/p'))
  const validUrls = allUrls
    .map((url: string) => validateAndNormalizeInstagramUrl(url))
    .filter((url: string | null): url is string => url !== null)
  console.log(`[${query}] URL 필터링 완료: ${allUrls.length} -> ${validUrls.length}`)

  const uniqueUrls = [...new Set(validUrls)]
  if (uniqueUrls.length === 0) {
    // Nothing to scrape: skip the Instagram actor run entirely.
    console.log(`[${query}] 포스트 파이프라인 완료`)
    return
  }

  const rawPosts = await getInstagramPosts(uniqueUrls)
  console.log(`[${query}] 포스트 디테일 정보 가져옴`, rawPosts.length)

  const classifiedPosts = await processBatch(
    rawPosts.filter((post: any) => !!(post.caption && post.username && post.url)),
    async post => {
      const [thumbnail, metadata] = await Promise.all([
        // Best-effort upload: keep the source thumbnail URL when S3 fails.
        uploadImageToS3(post.thumbnail).catch(() => post.thumbnail),
        classifyPostCaption(query, post.caption),
      ])
      return {
        ...post,
        thumbnail,
        metadata,
      }
    },
    { batchSize: 100, concurrency: 5 },
  )
  // Only posts for which the model found both a brand and a product are stored.
  const posts = classifiedPosts.filter((post: any) => !!(post.metadata.brand && post.metadata.productName))
  console.log(`[${query}] 포스트 공구 정보 추출 완료`, posts.length)

  const rows = posts.map((post: any) => [
    post.username,
    post.url,
    post.caption,
    post.metadata?.brand || null,
    query,
    post.metadata?.productName || null,
    JSON.stringify(post.metadata),
    post.timestamp,
    post.thumbnail,
  ])

  if (rows.length > 0) {
    // NOTE(review): the client is never closed/released here; createDbClient's
    // API is not visible from this file — confirm whether it needs cleanup.
    const client = await createDbClient()
    try {
      for (let start = 0; start < rows.length; start += MAX_ROWS_PER_INSERT) {
        const chunk = rows.slice(start, start + MAX_ROWS_PER_INSERT)
        const placeholders = buildInsertPlaceholders(chunk.length, TEMP_POSTS_COLUMN_COUNT)
        await client.query(
          `INSERT INTO temp_posts (username, post_url, content, brand, keyword, product_name, metadata, posted_at, thumbnail) VALUES ${placeholders}`,
          chunk.flat(),
        )
      }
      console.log(`[${query}] 포스트 데이터베이스 삽입 완료`, posts.length)
    } catch (e) {
      console.error(`[${query}] 포스트 데이터베이스 삽입 실패`, e)
    }
  }

  console.log(`[${query}] 포스트 파이프라인 완료`)
}
// Entry point: run the pipeline for every keyword concurrently and report
// overall completion once all of them have finished.
const pipelineKeywords = ['아빠차트 공구']
await Promise.all(pipelineKeywords.map(keyword => scrapePosts(keyword)))
console.log('전체 파이프라인 완료')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import { S3Client, PutObjectCommand } from '@aws-sdk/client-s3' | |
| import { v4 as uuidv4 } from 'uuid' | |
| import axios from 'axios' | |
/** Bucket that receives uploaded images; also used as the host of the returned URL. */
const BUCKET_NAME = 'img.zvzo.shop'

/** Key prefix (folder) under which uploaded images are stored. */
const TEMP_CREATOR_FOLDER = 'temp-match-growth'

/** Shared S3 client; region comes from `AWS_REGION`, falling back to `ap-northeast-2`. */
const s3Client = new S3Client({
  region: process.env.AWS_REGION || 'ap-northeast-2',
})

/**
 * Downloads the image at `url` and stores it in S3 under a random UUID name.
 *
 * @param url - Source image URL, fetched with axios as an array buffer.
 * @param contentType - Stored as the object's `ContentType`; the part after
 *                      `/` becomes the file extension (`jpg` when that part is empty).
 * @returns `https://<bucket>/<key>` for the uploaded object.
 * @throws Rejects when the download or the S3 upload fails.
 */
export async function uploadImageToS3(url: string, contentType: string = 'image/jpeg'): Promise<string> {
  const { data } = await axios.get(url, { responseType: 'arraybuffer' })
  const imageBytes = Buffer.from(data)

  const [, subtype] = contentType.split('/')
  const extension = subtype || 'jpg'
  const objectKey = `${TEMP_CREATOR_FOLDER}/${uuidv4()}.${extension}`

  await s3Client.send(
    new PutObjectCommand({
      Bucket: BUCKET_NAME,
      Key: objectKey,
      Body: imageBytes,
      ContentType: contentType,
    }),
  )

  return `https://${BUCKET_NAME}/${objectKey}`
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment