allieus · June 14, 2019 01:19 · allieus · Mar 5, 2017 · ymkim92 · Jun 14, 2019
diff --git a/crawl-팟빵-뉴스공장.py b/crawl-팟빵-뉴스공장.py
 import re
 import requests
 from itertools import count
 from pathlib import Path
 from bs4 import BeautifulSoup
 from clint.textui import progress


 def get_list(pid):
    """Walk the episode-list pages of channel ``pid`` and download each episode.

    Pages are requested in order starting at 1.  Every ``li > dl`` entry
    whose ``onclick`` attribute matches the expected pattern is handed to
    ``mp3_download`` together with the title taken from its ``dt`` tag.
    After each page the user is asked on stdin whether to continue; an
    empty answer or one starting with "y" moves on to the next page.

    The function returns ``None`` when the user declines, or as soon as a
    ``KeyError`` is raised while handling an entry (BeautifulSoup raises one
    when a tag lacks the requested attribute).
    """
    episode_pattern = r"'(\d+)',\s*'(\w+/\w+)'"
    list_url = 'http://www.podbbang.com/podbbangchnew/episode_list?id={pid}&page={page}'

    page = 1
    while True:
        print('try page {}'.format(page))

        response = requests.get(list_url.format(pid=pid, page=page))
        # Force the decoding used for response.text.
        response.encoding = 'utf8'
        # The 'lxml' parser needs the lxml package to be installed.
        soup = BeautifulSoup(response.text, 'lxml')

        for entry in soup.select('li > dl'):
            try:
                title = entry.find('dt')['title']
                found = re.search(episode_pattern, entry['onclick'])
                if found is None:
                    continue
                # Only the first captured group (the episode id) is needed.
                mp3_download(pid, found.group(1), title)
            except KeyError:
                print('Ended')
                return None

        prompt = '{page} 페이지를 시도할까요? (Y/n) '.format(page=page+1)
        answer = input(prompt).strip().lower()
        if answer and not answer.startswith('y'):
            print('Ended')
            return None
        page += 1


 def mp3_download(pid, eid, title):
    """Download one episode's audio to ``<title>.mp3`` in the current directory.

    Args:
        pid: Channel id, used in the download URL and the Referer header.
        eid: Episode id, used in the download URL.
        title: Episode title; it names the output file.  Path separators in
            it are replaced with ``_`` so the file always lands in the
            current directory.

    The download is skipped when the target file already exists with the
    size announced in the Content-Length header.  A non-200 response is
    only reported on stdout; nothing is raised for it.
    """
    url = 'http://www.podbbang.com/download?pid={pid}&eid={eid}'.format(pid=pid, eid=eid)

    headers = {
        'Referer': 'http://www.podbbang.com/ch/{pid}'.format(pid=pid),
    }
    r = requests.get(url, headers=headers, stream=True)

    # With stream=True the connection stays checked out until the body is
    # consumed or the response is closed, so close it on every exit path.
    try:
        if r.status_code != 200:
            print('download failed. status code = {}'.format(r.status_code))
            return

        safe_name = title.replace('/', '_').replace('\\', '_')
        filepath = Path('{}.mp3'.format(safe_name))

        # Content-Length may be absent; None then means "size unknown".
        content_length = r.headers.get('content-length')
        total_length = int(content_length) if content_length is not None else None

        if (total_length is not None and filepath.exists()
                and filepath.stat().st_size == total_length):
            print('{} - 이미 다운받았습니다.'.format(title))
            return

        print('{} - 다운로드'.format(title))
        chunk_size = 1024
        chunks = r.iter_content(chunk_size=chunk_size)
        if total_length is not None:
            # progress.bar needs an explicit size for a generator, so it is
            # only used when the expected number of chunks is known.
            expected_size = (total_length // chunk_size) + 1
            chunks = progress.bar(chunks, expected_size=expected_size)

        with filepath.open('wb') as f:
            for chunk in chunks:
                f.write(chunk)
    finally:
        r.close()


 if __name__ == '__main__':
    # Script entry point: crawl the hard-coded channel id '12548'.
    get_list(pid='12548')
diff --git a/requirements.txt b/requirements.txt
 beautifulsoup4
 requests
 clint
	import re
	import requests
	from itertools import count
	from pathlib import Path
	from bs4 import BeautifulSoup
	from clint.textui import progress


	def get_list(pid):
	    """Walk the episode-list pages of channel ``pid`` and download each episode.

	    Pages are requested in order starting at 1.  Every ``li > dl`` entry
	    whose ``onclick`` attribute matches the expected pattern is handed to
	    ``mp3_download`` together with the title taken from its ``dt`` tag.
	    After each page the user is asked on stdin whether to continue; an
	    empty answer or one starting with "y" moves on to the next page.

	    The function returns ``None`` when the user declines, or as soon as a
	    ``KeyError`` is raised while handling an entry (BeautifulSoup raises
	    one when a tag lacks the requested attribute).
	    """
	    for page in count(1):
	        print('try page {}'.format(page))
	        url = 'http://www.podbbang.com/podbbangchnew/episode_list?id={pid}&page={page}'.format(pid=pid, page=page)

	        response = requests.get(url)
	        # Force the decoding used for response.text.
	        response.encoding = 'utf8'
	        html = response.text

	        # The 'lxml' parser needs the lxml package to be installed.
	        soup = BeautifulSoup(html, 'lxml')

	        for dl_tag in soup.select('li > dl'):
	            try:
	                title = dl_tag.find('dt')['title']
	                js = dl_tag['onclick']
	                matched = re.search(r"'(\d+)',\s*'(\w+/\w+)'", js)
	                if matched:
	                    # Only the first captured group (the episode id) is used.
	                    eid = matched.group(1)
	                    mp3_download(pid, eid, title)
	            except KeyError:
	                print('Ended')
	                return None

	        line = input('{page} 페이지를 시도할까요? (Y/n) '.format(page=page+1)).strip().lower()
	        if line and not line.startswith('y'):
	            print('Ended')
	            break


	def mp3_download(pid, eid, title):
	    """Download one episode's audio to ``<title>.mp3`` in the current directory.

	    Args:
	        pid: Channel id, used in the download URL and the Referer header.
	        eid: Episode id, used in the download URL.
	        title: Episode title; it names the output file.  Path separators
	            in it are replaced with ``_`` so the file always lands in the
	            current directory.

	    The download is skipped when the target file already exists with the
	    size announced in the Content-Length header.  A non-200 response is
	    only reported on stdout; nothing is raised for it.
	    """
	    url = 'http://www.podbbang.com/download?pid={pid}&eid={eid}'.format(pid=pid, eid=eid)

	    headers = {
	        'Referer': 'http://www.podbbang.com/ch/{pid}'.format(pid=pid),
	    }
	    r = requests.get(url, headers=headers, stream=True)

	    # With stream=True the connection stays checked out until the body is
	    # consumed or the response is closed, so close it on every exit path.
	    try:
	        if r.status_code != 200:
	            print('download failed. status code = {}'.format(r.status_code))
	            return

	        safe_name = title.replace('/', '_').replace('\\', '_')
	        filepath = Path('{}.mp3'.format(safe_name))

	        # Content-Length may be absent; None then means "size unknown".
	        content_length = r.headers.get('content-length')
	        total_length = int(content_length) if content_length is not None else None

	        if (total_length is not None and filepath.exists()
	                and filepath.stat().st_size == total_length):
	            print('{} - 이미 다운받았습니다.'.format(title))
	            return

	        print('{} - 다운로드'.format(title))
	        chunk_size = 1024
	        chunks = r.iter_content(chunk_size=chunk_size)
	        if total_length is not None:
	            # progress.bar needs an explicit size for a generator, so it
	            # is only used when the expected number of chunks is known.
	            expected_size = (total_length // chunk_size) + 1
	            chunks = progress.bar(chunks, expected_size=expected_size)

	        with filepath.open('wb') as f:
	            for chunk in chunks:
	                f.write(chunk)
	    finally:
	        r.close()


	if __name__ == '__main__':
	    # Script entry point: crawl the hard-coded channel id '12548'.
	    get_list('12548')
No results found