Skip to content

Instantly share code, notes, and snippets.

@denrad
Created March 11, 2023 07:12
Show Gist options
  • Select an option

  • Save denrad/1c534163793c8f9890825b5a9115159f to your computer and use it in GitHub Desktop.

Select an option

Save denrad/1c534163793c8f9890825b5a9115159f to your computer and use it in GitHub Desktop.
import requests
from bs4 import BeautifulSoup
import datetime
import pytz
import feedgen.feed
from feedgen.entry import FeedEntry
# Scrape the news listing at gcheb.cap.ru, fetch each linked article, and
# write the result as an RSS file (news.rss) in the current directory.

# Listing page to scrape, and the site root prepended to article hrefs.
url = 'https://gcheb.cap.ru/news/?type=news'
base_url = 'https://gcheb.cap.ru'
# Seconds to wait on each HTTP request; without an explicit timeout
# requests.get() may block indefinitely on an unresponsive server.
request_timeout = 30
# Dates shown on the page are interpreted as Moscow time. Looked up once
# here because it does not change between news items.
tz = pytz.timezone('Europe/Moscow')

# Fetch and parse the listing page. Fail loudly on an HTTP error so that an
# error page is not silently turned into an empty feed.
response = requests.get(url, timeout=request_timeout)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Create the RSS feed and fill in the channel metadata.
feed = feedgen.feed.FeedGenerator()
feed.title('News from https://gcheb.cap.ru/news/?type=news')
feed.link(href=url)
feed.description('Latest news from the Cheboksary City Government website')

# Each news item found on the listing page becomes one feed entry.
news_items = soup.find_all('div', {'class': 'item_news'})
for item in news_items:
    title_anchor = item.find('a')
    link_anchor = item.find('a', {'class': 'news-list_title'})
    date_div = item.find('div', {'class': 'news-list_date'})
    # Skip items that lack the expected markup instead of aborting the
    # whole run with an AttributeError/TypeError on None.
    if title_anchor is None or link_anchor is None or date_div is None:
        continue
    href = link_anchor.get('href')
    if not href:
        continue

    # Title comes from the first anchor in the item; if that anchor carries
    # no text, fall back to the text of the titled link.
    title = title_anchor.get_text().strip() or link_anchor.get_text().strip()
    link = base_url + href
    date = date_div.text.strip()

    # Parse the 'HH:MM | DD.MM.YYYY' stamp. pytz zones must be attached with
    # localize(): datetime.replace(tzinfo=...) would pick the zone's
    # historical local-mean-time offset instead of the real UTC offset.
    dt = tz.localize(datetime.datetime.strptime(date, '%H:%M | %d.%m.%Y'))

    # Fetch the article page and pull out its body text. A failed fetch or
    # a page without the expected container yields an entry with no
    # description rather than a crash.
    content = ''
    article_response = requests.get(link, timeout=request_timeout)
    if article_response.ok:
        article_soup = BeautifulSoup(article_response.text, 'html.parser')
        content_div = article_soup.find('div', {'class': 'news_text'})
        if content_div is not None:
            content = content_div.text.strip()

    # Build the feed entry and attach it to the feed.
    entry = FeedEntry()
    entry.title(title)
    entry.link(href=link)
    entry.pubDate(dt)
    if content:
        entry.description(content)
    feed.add_entry(entry)

# Generate the RSS feed and save it to a file.
feed.rss_file('news.rss')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment