| """Download all available audio books from DB ICE Portal.""" | |
| import json | |
| import os | |
| import urllib.parse | |
| import urllib.request | |
| BASE = 'http://iceportal.de/api1/rs/' | |
| def load_json(url: str, *, verbose: bool = True): | |
| if verbose: | |
| print(url) | |
| with urllib.request.urlopen(url) as f: | |
| doc = json.load(f) | |
| return doc | |
| def get_page(href: str, *, | |
| base: str = urllib.parse.urljoin(BASE, 'page/')): | |
| url = urllib.parse.urljoin(base, href.lstrip('/')) | |
| return load_json(url) | |
| def retrieve(source, target, *, | |
| base: str = urllib.parse.urljoin(BASE, 'audiobooks/path/')) -> None: | |
| sheet = urllib.parse.urljoin(base, source.lstrip('/')) | |
| path = load_json(sheet)['path'] | |
| url = urllib.parse.urljoin(base, path) | |
| urllib.request.urlretrieve(url, filename=target) | |
| audiobooks = get_page('hoerbuecher') | |
| for group in audiobooks['teaserGroups']: | |
| for item in group['items']: | |
| print('', item['title'], sep='\n') | |
| page = get_page(item['navigation']['href']) | |
| dirname = page['title'] | |
| # fix invalid | |
| dirname = dirname.replace('.', '_') | |
| for remove_char in ('"', '?', '&', '/', '|'): | |
| dirname = dirname.replace(remove_char, '') | |
| dirname, _, _ = dirname.partition(':') | |
| if not os.path.exists(dirname): | |
| os.makedirs(dirname) | |
| for file in page['files']: | |
| url = file['path'] | |
| target = os.path.join(dirname, | |
| '{:d} - {}'.format(file['serialNumber'], | |
| url.rpartition('/')[2])) | |
| if not os.path.exists(target): | |
| retrieve(url, target) |
Hi! I used this script yesterday, and it worked for quite a while.
But then I saw a file being downloaded, its size shrinking back to zero, being redownloaded, shrinking to zero again, and so on. It was more or less an endless loop until the WiFi connection itself was lost.
I debugged it and traced the behaviour to the line urllib.request.urlretrieve(url, filename=target) in the retrieve function.
Has anyone else seen this behaviour and/or has an idea how to stop it?
Could it be that urlretrieve gets a redirect while it is loading, does a redownload, gets another redirect, redownloads again, and so on?
Is there a parameter for this function that would make it ignore such redirects/redownloads, or another function that does more or less the same thing?
I would be happy if urlretrieve would throw an exception or return an error code when this happens, so the script could catch it and download the remaining files.
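For reference, a minimal sketch of the kind of replacement I have in mind, assuming the same url/target arguments as in the gist's retrieve function; the retrieve_checked name and the Content-Length check are my own idea, not part of the original script:

import os
import shutil
import urllib.request

def retrieve_checked(url, target):
    # single streamed download instead of urlretrieve
    with urllib.request.urlopen(url) as response, open(target, 'wb') as f:
        expected = response.headers.get('Content-Length')
        shutil.copyfileobj(response, f)
    # if the server announced a size, verify the file on disk matches it
    if expected is not None and os.path.getsize(target) != int(expected):
        raise OSError('incomplete download: {}'.format(target))

If an incomplete download raised like this, the main loop could catch the exception, remove the partial file, and continue with the remaining ones.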
Thanks @contrequarte, adapted so that the file names now always start with the serial number.
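For illustration, the target names come from the format call in the script above; with made-up example values it produces names like:

>>> '{:d} - {}'.format(3, 'kapitel_03.mp3')  # example values, not real portal file names
'3 - kapitel_03.mp3'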