Created
November 24, 2016 02:14
-
-
Save dspp779/f2ce215aa9da014f66f6476a882cc8b8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Requirement already satisfied (use --upgrade to upgrade): selenium in /usr/local/lib/python3.5/site-packages\n", | |
| "\u001b[33mYou are using pip version 8.1.2, however version 9.0.0 is available.\n", | |
| "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!pip3 install selenium" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# `brew cask install` was removed from Homebrew; `brew install --cask` is the current form.\n", | |
| "!brew install --cask google-chrome" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "!brew install chromedriver" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from selenium import webdriver\n", | |
| "driver = webdriver.Chrome()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "driver.get(\"http://bulletin.web.nthu.edu.tw/files/40-1912-5083-1.php?Lang=zh-tw\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from selenium.webdriver.common.by import By\n", | |
| "\n", | |
| "# find_elements_by_css_selector was removed in Selenium 4.3; the By-based\n", | |
| "# locator below works on both older and current Selenium releases.\n", | |
| "elems = driver.find_elements(By.CSS_SELECTOR, \"div.h5\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "[ 2016-11-10 ] 【學術午餐討論會】 清大生倫中心將於105年11月至12月間舉辦三場學術午餐討論會,本次主題為【訂作一段生命的奇幻旅程-談基因與再生醫學時代的出生到終老】,歡迎各界踴躍報名參加\n", | |
| "[ 2016-11-07 ] 105/11/29晚上科技人文講座六:工業基礎技術 決戰 【亞洲.矽谷】等你來參加!! 名額有限,敬請把握機會。\n", | |
| "[ 2016-11-02 ] 【講座資訊】2016孫運璿科技講座演講\n", | |
| "[ 2016-11-02 ] 11/21學習系列講座-口語表達技巧,請踴躍報名參加\n", | |
| "[ 2016-11-01 ] 歡迎參加11月24日「國立清華大學書畫藏品所見日治台灣裝裱」演講\n", | |
| "[ 2016-10-26 ] 【活水講堂】無國界醫生前線參與──我在史瓦濟蘭的7個月,11月19日(六)14:00-16:00 開講,歡迎參加!\n", | |
| "[ 2016-10-04 ] 玄奘大學舉辦105學年度「桃竹苗美學列車」於10月跨校巡迴演出\n", | |
| "[ 2016-09-19 ] 不散.夜貓子電影院十週年精選影展\n", | |
| "[ 2016-09-09 ] 9月5日(一)起,校史微展┴〈你所不知道的清知識〉之「清華人是有練過的」、「清華好free」篇登場!\n", | |
| "[ 2016-09-09 ] [書展] 科技部補助圖書計畫書展:精選美學、古希臘哲學經典書籍,內容豐富多元,歡迎參觀!\n", | |
| "[ 2016-08-11 ] 8/27(六)曾晴賢教授演講-「扭轉巨變中的台灣生態----以淡水生態為例」\n", | |
| "[ 2016-08-03 ] 一起來抓戀愛bug!「活水講堂」:你還愛我嗎?五個愛情裡的思考謬誤。8月20日(六)14:00-16:00開講\n", | |
| "[ 2016-07-05 ] 【轉知】「高雄市81氣爆紀念裝置藝術計畫」公開徵選\n", | |
| "[ 2016-07-01 ] 「105年電影藝術分享計畫-螢火蟲電影院」7/9 活動開跑,清大也有場次!\n", | |
| "[ 2016-07-01 ] 【轉知藝文活動快訊】 北投異托邦藝術計畫講座暨工作坊\n", | |
| "[ 2016-06-30 ] 「活水講堂」:不同凡「想」 x 不同凡響 : 發揮創意 改變人生!7月16日(六)14:00-16:00 開講,歡迎參加!\n", | |
| "[ 2016-06-30 ] 中國醫藥大學 「PBL師資培訓:教案寫作研習營」\n", | |
| "[ 2016-05-26 ] 歡迎踴躍報名參加 5/30(一)19:30~22:00 永續能源系列講座:台灣到底會不會缺電?\n", | |
| "[ 2016-05-16 ] 法鼓文理學院2016年兩岸教師研修體驗營\n", | |
| "[ 2016-05-04 ] 『看見台灣』導演齊柏林來清大演講。講題:齊柏林的飛行視界。地點:台達館景德講堂。(5月18日晚上7點)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "for elem in elems:\n", | |
| " print(elem.text)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from selenium.webdriver.common.by import By\n", | |
| "\n", | |
| "# find_elements_by_css_selector was removed in Selenium 4.3; use the\n", | |
| "# By-based locator, which older Selenium releases accept as well.\n", | |
| "driver.find_elements(By.CSS_SELECTOR, \"div.h5\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['http://news.ltn.com.tw/news/life/paper/1047866',\n", | |
| " 'http://news.ltn.com.tw/news/life/breakingnews/1866371',\n", | |
| " 'http://news.ltn.com.tw/news/life/breakingnews/1864088',\n", | |
| " 'http://news.ltn.com.tw/news/life/breakingnews/1861102',\n", | |
| " 'http://news.ltn.com.tw/news/life/breakingnews/1857627',\n", | |
| " 'http://news.ltn.com.tw/news/life/breakingnews/1857505',\n", | |
| " 'http://news.ltn.com.tw/news/life/paper/1042231',\n", | |
| " 'http://news.ltn.com.tw/news/life/breakingnews/1856793',\n", | |
| " 'http://news.ltn.com.tw/news/life/breakingnews/1856810']" | |
| ] | |
| }, | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "urls" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def get_news_text(url):\n", | |
| "    \"\"\"Load a news article in the shared Selenium `driver` and scrape it.\n", | |
| "\n", | |
| "    Args:\n", | |
| "        url: address of the article page to open.\n", | |
| "\n", | |
| "    Returns:\n", | |
| "        A `(title, content)` tuple. `title` is the text of the first\n", | |
| "        `.content h1` element, or '' when the page has none; `content` is a\n", | |
| "        list with the stripped text of every `#newstext p` paragraph.\n", | |
| "    \"\"\"\n", | |
| "    # Imported locally so the function works even if this cell is run alone.\n", | |
| "    from selenium.webdriver.common.by import By\n", | |
| "\n", | |
| "    driver.get(url)\n", | |
| "    # find_elements(By...) replaces find_elements_by_css_selector, which\n", | |
| "    # Selenium 4.3 removed; this form also works on older releases.\n", | |
| "    headings = driver.find_elements(By.CSS_SELECTOR, \".content h1\")\n", | |
| "    # A page without a headline must not abort the whole crawl with IndexError.\n", | |
| "    title = headings[0].text if headings else ''\n", | |
| "    paragraphs = driver.find_elements(By.CSS_SELECTOR, \"#newstext p\")\n", | |
| "    content = [p.text.strip() for p in paragraphs]\n", | |
| "    return title, content" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "data = []\n", | |
| "for url in urls:\n", | |
| " title, content = get_news_text(url)\n", | |
| " data.append({'title': title, 'content': content})" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import json\n", | |
| "\n", | |
| "# Use a context manager so the file is flushed and closed, and pin UTF-8\n", | |
| "# because ensure_ascii=False writes the Chinese text unescaped.\n", | |
| "with open('data161104.json', 'w', encoding='utf-8') as f:\n", | |
| "    json.dump(data, f, ensure_ascii=False, indent=2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'〔記者洪美秀/竹市報導〕清華大學與新竹教育大學昨天正式合校,清華大學前學生會長徐光成抗議清大校方在處理合校案過程粗糙獨裁,從前天傍晚六點開始絕食,直到昨天已絕食超過廿四小時,僅喝水維持身體能量,清大人社所長陳明祺與同學前往關心、聲援,呼籲校方出面回應。'" | |
| ] | |
| }, | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "a.text.strip()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.5.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 1 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment