https://tmr.js.org/p/5b094510/
req:
https://quiet-wave-55460.herokuapp.com/title?url=https://tmr.js.org
resp:
{
  "data": ["xxxx", "xxxx"]
}
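For reference, a minimal client call against the endpoint shown above could look like the sketch below. This assumes the Heroku instance is still reachable and that it responds with JSON in the {"data": [...]} shape from the sample; the timeout value is arbitrary.

import requests

# Query the title service for a target page and print the candidate titles.
resp = requests.get(
    "https://quiet-wave-55460.herokuapp.com/title",
    params={"url": "https://tmr.js.org"},
    timeout=10,
)
resp.raise_for_status()
print(resp.json().get("data", []))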
import requests
from bs4 import BeautifulSoup
from collections import Counter
import jieba
import jieba.analyse


def get_titles(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'zh-CN,zh;q=0.9'
    }
    result = []
    try:
        response = requests.request("GET", url, headers=headers, timeout=5)
        # Collect candidate titles
        if response.ok:
            response.encoding = response.apparent_encoding
            soup = BeautifulSoup(response.text, 'html.parser')
            titles = []
            # Walk the title-like tags
            tags = ["title", "h1", "h2"]
            for tag in tags:
                items = soup.find_all(tag)
                # Clean up and extract the text
                texts = []
                for item in items:
                    text = clear_text(item.text)
                    if text:
                        texts.append(text)
                # Keep the first two
                for text in texts[:2]:
                    titles.append(text)
            # Walk common title class names
            classes = ["title", "content-title",
                       "DailyHeader-title", "question-title"]
            for c in classes:
                items = soup.find_all(attrs={"class": c})
                # Clean up and extract the text
                texts = []
                for item in items:
                    text = clear_text(item.text)
                    if text:
                        texts.append(text)
                # Keep the first two
                for text in texts[:2]:
                    titles.append(text)
            print("all titles: ", titles)
            titles = list(set(titles))
            if len(titles) <= 1:
                return titles
            # Pick the best-fitting titles
            # Extract the top-3 keywords from all candidates
            top_tag = jieba.analyse.extract_tags("\n".join(titles), topK=3)
            print("top_tag: ", top_tag)
            # Count how many of the keywords each title hits
            counter = Counter()
            for title in set(titles):
                for t in top_tag:
                    if t in title:
                        counter[title] += 1
            print("counter: ", counter)
            # Take the top two titles by keyword hits
            top_title = counter.most_common(2)
            length = len(top_title)
            if length == 2:
                # Same hit count and one is a prefix/suffix of the other: keep the shorter one
                # if top_title[0][1] == top_title[1][1] and (top_title[0][0] in top_title[1][0] or top_title[1][0] in top_title[0][0]):
                if top_title[0][1] == top_title[1][1] and (top_title[0][0].startswith(top_title[1][0]) or top_title[0][0].endswith(top_title[1][0]) or top_title[1][0].startswith(top_title[0][0]) or top_title[1][0].endswith(top_title[0][0])):
                    result.append(min(top_title, key=lambda x: len(x[0]))[0])
                else:
                    for title in top_title:
                        result.append(title[0])
            elif length == 1:
                result.append(top_title[0][0])
    except Exception as e:
        print(e)
    return result
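Note that clear_text is referenced above but not included in this listing. A minimal stand-in, assuming it only collapses whitespace and drops empty strings, might look like the following; the exact cleanup rules are an assumption, not the original implementation.

import re

def clear_text(text):
    # Hypothetical stand-in for the clear_text helper used by get_titles:
    # collapse runs of whitespace, strip the ends, return "" for empty input.
    if not text:
        return ""
    return re.sub(r"\s+", " ", text).strip()

With that helper in place, a call like get_titles("https://tmr.js.org") returns the deduplicated best-guess titles, or an empty list if the request fails.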
beautifulsoup4==4.9.3
bs4==0.0.1
requests==2.25.1
jieba==0.42.1
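If the pinned list above is saved as requirements.txt, the dependencies can be installed in the usual way:

pip install -r requirements.txt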