Created
June 8, 2021 07:51
-
-
Save foxy4096/41909f56d3366bc93a638efcf78bde99 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from os import name | |
| import requests | |
| from requests import get | |
| import pandas as pd | |
| from bs4 import BeautifulSoup | |
| import numpy as np | |
# Scrape one page of IMDb "top_1000" search results and print the per-movie
# fields (title, year, runtime, rating, metascore, votes, gross) as a table.

# Request English-language content from the server.
header = {"Accept-Language": "en-us, en;q=0.5"}
url = "https://www.imdb.com/search/title/?groups=top_1000&ref_=adv_prv"
results = requests.get(url, headers=header)
soup = BeautifulSoup(results.text, "html.parser")


def _text_or_dash(tag):
    """Return ``tag.text``, or '-' when the tag is missing or has no text.

    ``find()`` returns ``None`` when nothing matches, so the element itself
    must be checked before its ``.text`` attribute is read.
    """
    return tag.text if tag is not None and tag.text else '-'


# One list per output column; each loop iteration appends exactly one value
# to every list so all columns stay the same length.
titles = []
years = []
time = []
ratings = []
scores = []
votes = []
gross = []
genres = []

movie_div = soup.find_all('div', class_="lister-item mode-advanced")

for container in movie_div:
    # Named `title` rather than `name` so the loop does not shadow the
    # `name` imported from `os` at the top of the file.
    title = container.h3.a.text
    titles.append(title)

    year = container.h3.find('span', class_="lister-item-year").text
    years.append(year)

    # Runtime is collected per movie; previously only the last value was
    # kept and then repeated on every row of the table.
    runtime = _text_or_dash(container.p.find('span', class_='runtime'))
    time.append(runtime)

    imdb = float(container.strong.text)
    ratings.append(imdb)

    # metascore
    m_score = _text_or_dash(container.find('span', class_='metascore'))
    scores.append(m_score)

    genre = _text_or_dash(container.p.find('span', class_="genre"))
    genres.append(genre)

    # The spans with name="nv" hold the votes (first) and, when present,
    # the gross (second); either may be absent.
    nv = container.find_all('span', attrs={'name': 'nv'})
    vote = nv[0].text if nv else '-'
    votes.append(vote)
    grosses = nv[1].text if len(nv) > 1 else '-'
    gross.append(grosses)

movies = pd.DataFrame({
    'movie': titles,
    'year': years,
    # 'genre': genres,
    'timeMin': time,
    'imdb': ratings,
    'metascore': scores,
    'votes': votes,
    'us_grossMillions': gross,
})

print(movies)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment