Created
March 11, 2023 07:12
-
-
Save denrad/1c534163793c8f9890825b5a9115159f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Scrape the gcheb.cap.ru news listing and write it out as an RSS file.

The script downloads the news listing page, visits every linked article to
collect its text, and saves the resulting feed to ``news.rss`` in the current
working directory.
"""
import datetime
from urllib.parse import urljoin

import feedgen.feed
import pytz
import requests
from bs4 import BeautifulSoup
from feedgen.entry import FeedEntry

# URL to scrape
url = 'https://gcheb.cap.ru/news/?type=news'

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would hang the script forever.
REQUEST_TIMEOUT = 30

# Timezone applied to the listing's timestamps (they carry no zone of their
# own). Looked up once instead of on every loop iteration.
tz = pytz.timezone('Europe/Moscow')


def _fetch_soup(page_url):
    """Download *page_url* and return its HTML parsed with BeautifulSoup.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status, so an error page is never scraped as news.
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


# Fetch and parse the listing page
soup = _fetch_soup(url)

# Create a new RSS feed using feedgen
feed = feedgen.feed.FeedGenerator()
feed.title('News from https://gcheb.cap.ru/news/?type=news')
feed.link(href=url)
feed.description('Latest news from the Cheboksary City Government website')

# Find all news items on the page
news_items = soup.find_all('div', {'class': 'item_news'})

# Loop through each news item and create an RSS feed entry
for item in news_items:
    first_anchor = item.find('a')
    title_anchor = item.find('a', {'class': 'news-list_title'})
    date_div = item.find('div', {'class': 'news-list_date'})

    # Skip items that lack the expected markup instead of aborting the
    # whole run on a single malformed entry.
    if first_anchor is None or title_anchor is None or date_div is None:
        continue
    href = title_anchor.get('href')
    if not href:
        continue

    title = first_anchor.get_text().strip()
    # urljoin resolves relative hrefs against the listing URL and leaves
    # already-absolute hrefs untouched.
    link = urljoin(url, href)

    # Parse the timestamp and attach the timezone. pytz zones must be applied
    # with localize(): datetime.replace(tzinfo=...) would attach the zone's
    # historical local-mean-time offset rather than the offset actually in
    # effect on that date.
    naive_dt = datetime.datetime.strptime(
        date_div.text.strip(), '%H:%M | %d.%m.%Y'
    )
    dt = tz.localize(naive_dt)

    # Fetch the article page and extract its text
    article_soup = _fetch_soup(link)
    content_div = article_soup.find('div', {'class': 'news_text'})
    # Some pages may not contain the expected container; fall back to the
    # title so the entry still has a description.
    content = content_div.text.strip() if content_div is not None else ''

    # Create a new RSS feed entry
    entry = FeedEntry()
    entry.title(title)
    entry.link(href=link)
    entry.pubDate(dt)
    entry.description(content or title)

    # Add the entry to the feed
    feed.add_entry(entry)

# Generate the RSS feed and save it to a file
feed.rss_file('news.rss')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment