Last active
June 17, 2025 21:27
-
-
Save rvtr/1b471e5f5215c368fd78d9aba05f8dc2 to your computer and use it in GitHub Desktop.
Scrapes agenda data from london.ca
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Hey folks, please do not run this script more than necessary.
# Too many search requests will temporarily block searches for everyone, not just you.
# I do not want to DDOS London. I just want to allow for personal backups. Cheers!
echo -e "\n-========================================================================-"
echo -e "-=-                                                                    -=-"
echo -e "-=-   SCRAPE_MEETINGS.SH: Downloads committee videos and agendas       -=-"
echo -e "-=-                                                                    -=-"
echo -e "-=-   https://gist.github.com/rvtr/1b471e5f5215c368fd78d9aba05f8dc2    -=-"
echo -e "-=-   Lillian Skinner (2025)                                           -=-"
echo -e "-=-                                                                    -=-"
echo -e "-========================================================================-"
echo "Starting job: SCRAPE_MEETINGS: $(date)"
# Warning to all who read this script:
# It is badly written. I know it is bad, but I am tired okay, and sometimes sloppy just works.
# London seems to have recently blocked unusual user agents. Can't use wget or even ping.
# Thankfully we can pretend to be a real browser with a spoofed UA string.
WGET_UA="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Viewer/99.9.8782.87"

# Scratch locations: the search results page and the agenda currently being worked on.
TEMP_DIR="./tmp/"
SEARCH_PAGE="./tmp/index.html"
AGENDA_HTML="./tmp/work.html"
#VIDEO_TIMESTAMP_JSON="./tmp/time.json"

# Start every run from a clean scratch directory.
# Quoted expansions + 'rm -rf --' so an odd path can never glob or word-split,
# and 'mkdir -p' so a leftover directory does not abort the script.
if [ -d "$TEMP_DIR" ]; then
  rm -rf -- "$TEMP_DIR"
fi
rm -f -- "$SEARCH_PAGE" "$AGENDA_HTML"
mkdir -p "$TEMP_DIR"

SEARCH_URL="https://london.ca/government/council-civic-administration/council-committee-meetings/meetings"
# Need to confirm. When stacking params does the type need to be f[1]?
SEARCH_FORMAT_COMMITTEE="f[1]=meeting_type%3A"
SEARCH_FORMAT_DATE="f[0]=meeting_date%3A"
SEARCH_FORMAT_QUERY="search=query&sort_by=field_meeting_date"

# Year range to scrape: [i, x). As far as I'm aware there are no meetings prior to 2011.
# x = i + 1 deliberately limits the run to a single year; widen x to scrape more.
i=2011
x=$((i + 1))
echo "$x" # debug: show the (exclusive) end year
SEARCH_END="FALSE"
# Decode the HTML entities that appear in scraped href values so wget receives real URLs.
# NOTE(review): the original sed pair here was garbled in transit ("s/&/\&/g" and a
# broken-quoted apostrophe substitution); this reconstruction assumes the intent was to
# decode &amp; and &#039; — confirm against the live search-result markup.
decode_url() {
  sed -e 's/&amp;/\&/g' -e "s/&#039;/'/g"
}

# Outer loop: each year from $i up to (but not including) $x.
# Inner loop: walk the paginated search results until the site reports no more pages.
while (( i < x )); do
  j=0
  SEARCH_END="FALSE"
  while [[ "$SEARCH_END" == "FALSE" ]]; do
    echo "SCRAPE_MEETINGS: Downloading search results... Page $j of $i"
    wget --user-agent="$WGET_UA" "$SEARCH_URL?$SEARCH_FORMAT_DATE$i&page=$j" -O "$SEARCH_PAGE" -q #--show-progress
    WGET_RC=$?
    # wget exits 8 on a server error response — that marks the end of pagination.
    if [ "$WGET_RC" -ne 8 ]; then
      FOUNDMEETING="FALSE"
      if ! grep -q "No results found." "$SEARCH_PAGE"; then
        while IFS= read -r LINE; do
          # All meeting items in the search results are formatted like so:
          #   - One line with the name
          #   - Second line with all other info including links
          #
          # The first line is recognised by the class "views-field-field-meeting-notes".
          # FOUNDMEETING=TRUE records that the first line was seen, so the NEXT line read
          # is "confirmed" as line 2 (the links line) of the meeting info.
          if [[ "TRUE" == "$FOUNDMEETING" ]]; then
            FOUNDMEETING="FALSE"
            # BUGFIX: ERROR was never initialised (and never reset between meetings);
            # the later check was also a tautology. Reset it per meeting here.
            ERROR="FALSE"
            echo "SCRAPE_MEETINGS: -========================================================================-"
            echo "SCRAPE_MEETINGS: Working on $MEETING_NAME ($MEETING_YEAR/$MEETING_MONTH/$MEETING_DAY)"
            echo "SCRAPE_MEETINGS: All files to be saved as $MEETING_NAME/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY/"
            echo "SCRAPE_MEETINGS: -========================================================================-"
            echo "SCRAPE_MEETINGS: Task starting on: $(date)"
            #echo "CANCEL NOW!!!"
            #sleep 5
            # Grab meeting item links (one URL per line)...
            echo "$LINE" | sed 's/href=./\nhref="/g' | grep 'href="https' | sed 's/.*href="\([^"]*\)".*/\1/p' | uniq > ./tmp/meeting_urls
            # ...and the matching item labels ("types"), in the same order.
            echo "$LINE" | sed 's/rel=.noreferrer.>/\nrel="noreferrer">/g' | grep 'rel="noreferrer">' | sed 's/.*rel="noreferrer">\([^<]*\)<.*/\1/p' | uniq > ./tmp/meeting_types
            AGENDA_HTML_URL=""
            AGENDA_PDF_URL=""
            AGENDA_REVISE_HTML_URL=""
            AGENDA_REVISE_PDF_URL=""
            MINUTES_HTML_URL=""
            MINUTES_PDF_URL=""
            MINUTES_ATTACH_PDF_URL=""
            echo "SCRAPE_MEETINGS: Found the following documents:"
            # Read URL (fd 0) and label (fd 3) in lockstep; the labels carry a trailing space.
            while IFS= read -r DOC_URL && IFS= read -r DOC_LABEL <&3; do
              echo "SCRAPE_MEETINGS: - $DOC_LABEL"
              case "$DOC_LABEL" in
                "Agenda (HTML) ")
                  AGENDA_HTML_URL=$(printf '%s\n' "$DOC_URL" | decode_url) ;;
                "Agenda (PDF) ")
                  AGENDA_PDF_URL=$(printf '%s\n' "$DOC_URL" | decode_url) ;;
                "Revised Agenda (HTML) ")
                  AGENDA_REVISE_HTML_URL=$(printf '%s\n' "$DOC_URL" | decode_url) ;;
                "Revised Agenda (PDF) ")
                  AGENDA_REVISE_PDF_URL=$(printf '%s\n' "$DOC_URL" | decode_url) ;;
                "Minutes (HTML) ")
                  MINUTES_HTML_URL=$(printf '%s\n' "$DOC_URL" | decode_url) ;;
                "Minutes (PDF) ")
                  MINUTES_PDF_URL=$(printf '%s\n' "$DOC_URL" | decode_url) ;;
                "Minutes with Attachments (PDF) ")
                  MINUTES_ATTACH_PDF_URL=$(printf '%s\n' "$DOC_URL" | decode_url) ;;
              esac
            done < ./tmp/meeting_urls 3< ./tmp/meeting_types
            # Always prefer Revised Agendas
            echo "SCRAPE_MEETINGS: Downloading agenda HTML..."
            if [[ "$AGENDA_REVISE_HTML_URL" != "" ]]; then
              wget --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O "$AGENDA_HTML" -q #--show-progress
            elif [[ "$AGENDA_HTML_URL" != "" ]]; then
              wget --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O "$AGENDA_HTML" -q #--show-progress
            else
              # No agenda HTML at all — skip the whole save section for this meeting.
              ERROR="TRUE"
            fi
            # BUGFIX: original read `[[ ERROR="FALSE" ]]`, a non-empty literal that is
            # ALWAYS true, so the save section ran even when no agenda was found.
            if [[ "$ERROR" == "FALSE" ]]; then
              # mkdir -p replaces the unconditional mkdir chain (which errored on reruns).
              MEETING_DIR="./LondonArchive/Meetings/$MEETING_NAME/$MEETING_YEAR/$MEETING_MONTH-$MEETING_DAY"
              mkdir -p "$MEETING_DIR/Attachments"
              # Direct video links are always "video.isilive.ca/<REGION>/<NAME>".
              # There are some eScribe ones, but those are in m3u8s and are really
              # annoying to work with ...not as annoying as more sed though.
              VIDEO_URL=$(grep 'id="isi_player"' "$AGENDA_HTML" | sed -n 's/.*data-stream_name="\([^"]*\)".*/\1/p' | sed 's/ /%20/g')
              if [[ "$VIDEO_URL" != "" ]]; then
                echo "SCRAPE_MEETINGS: Saving recording URL..."
                echo "https://video.isilive.ca/london/$VIDEO_URL" > "$MEETING_DIR/RecordingLink.txt"
              fi
              # Get attachment links (de-duplicated with awk while preserving order)...
              grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" "$AGENDA_HTML" | sed 's/href=.filestream\.ashx/\nhref="filestream\.ashx/g' | grep 'filestream.ashx' | sed 's/. data-toggle/\" data-toggle/p' | sed 's/href=.\([^/]*\)".*/\1/p' | awk '!x[$0]++' > ./tmp/attachment_urls
              # ...and the matching attachment display names.
              grep "AgendaItemAttachment AgendaItemAttachmentNotSelected" "$AGENDA_HTML" | sed 's/data-original-title=./\ndata-original-title='\''/g' | grep 'data-original-title' | sed 's/data-original-title=.//p' | sed 's/.pdf['\'':"].*/.pdf/g' | awk '!x[$0]++' > ./tmp/attachment_names
              # Download each attachment under the name grabbed above.
              echo "SCRAPE_MEETINGS: Found the following agenda attachments:"
              while IFS= read -r ATTACH_URL && IFS= read -r ATTACH_NAME <&3; do
                echo "SCRAPE_MEETINGS: - $ATTACH_NAME"
                wget --user-agent="$WGET_UA" "https://pub-london.escribemeetings.com/$ATTACH_URL" -O "$MEETING_DIR/Attachments/$ATTACH_NAME" -q #--show-progress
              done < ./tmp/attachment_urls 3< ./tmp/attachment_names
              echo "SCRAPE_MEETINGS: All attachments saved."
              # Save the agenda: prefer PDF copies; fall back to HTML when no PDF exists.
              if [[ "$AGENDA_REVISE_PDF_URL" != "" ]] || [[ "$AGENDA_PDF_URL" != "" ]]; then
                if [[ "$AGENDA_REVISE_PDF_URL" != "" ]]; then
                  echo "SCRAPE_MEETINGS: Saving revised agenda as PDF..."
                  wget --user-agent="$WGET_UA" "$AGENDA_REVISE_PDF_URL" -O "$MEETING_DIR/Agenda_Revised.pdf" -q #--show-progress
                fi
                if [[ "$AGENDA_PDF_URL" != "" ]]; then
                  echo "SCRAPE_MEETINGS: Saving regular agenda as PDF..."
                  wget --user-agent="$WGET_UA" "$AGENDA_PDF_URL" -O "$MEETING_DIR/Agenda.pdf" -q #--show-progress
                fi
              else
                if [[ "$AGENDA_REVISE_HTML_URL" != "" ]]; then
                  echo "SCRAPE_MEETINGS: Saving revised agenda as HTML... (no PDF found!)"
                  wget --user-agent="$WGET_UA" "$AGENDA_REVISE_HTML_URL" -O "$MEETING_DIR/Agenda_Revised.html" -q #--show-progress
                fi
                if [[ "$AGENDA_HTML_URL" != "" ]]; then
                  echo "SCRAPE_MEETINGS: Saving regular agenda as HTML... (no PDF found!)"
                  wget --user-agent="$WGET_UA" "$AGENDA_HTML_URL" -O "$MEETING_DIR/Agenda.html" -q #--show-progress
                fi
              fi
              # Save the minutes with the same PDF-first preference.
              if [[ "$MINUTES_ATTACH_PDF_URL" != "" ]] || [[ "$MINUTES_PDF_URL" != "" ]]; then
                if [[ "$MINUTES_ATTACH_PDF_URL" != "" ]]; then
                  echo "SCRAPE_MEETINGS: Saving minutes with attachments as PDF..."
                  wget --user-agent="$WGET_UA" "$MINUTES_ATTACH_PDF_URL" -O "$MEETING_DIR/Minutes_With_Attachments.pdf" -q #--show-progress
                fi
                if [[ "$MINUTES_PDF_URL" != "" ]]; then
                  echo "SCRAPE_MEETINGS: Saving minutes as PDF..."
                  wget --user-agent="$WGET_UA" "$MINUTES_PDF_URL" -O "$MEETING_DIR/Minutes.pdf" -q #--show-progress
                fi
              else
                if [[ "$MINUTES_HTML_URL" != "" ]]; then
                  echo "SCRAPE_MEETINGS: Saving minutes as HTML... (no PDF found!)"
                  wget --user-agent="$WGET_UA" "$MINUTES_HTML_URL" -O "$MEETING_DIR/Minutes.html" -q #--show-progress
                fi
              fi
            fi
            echo "SCRAPE_MEETINGS: All files from this meeting have been saved."
          fi
          # Does this line start a new meeting entry? If so, parse its header now;
          # the links line that follows is handled on the next iteration (above).
          if echo "$LINE" | grep -q "views-field-field-meeting-notes"; then
            # Header text looks like: "January 2, 2025 - Committee Name"
            MEETING_INFO=$(echo "$LINE" | sed -n 's/.*<div class="meeting__date">\([^<]*\)<\/div>.*/\1/p')
            MEETING_MONTH_WORD=$(echo "$MEETING_INFO" | sed -E 's/^([A-Za-z]+) .*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
            MEETING_DAY_SHORT=$(echo "$MEETING_INFO" | sed -E 's/^[A-Za-z]+ ([0-9]+),.*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
            MEETING_DAY=$(printf "%02d" "$MEETING_DAY_SHORT")
            MEETING_YEAR=$(echo "$MEETING_INFO" | sed -E 's/^[A-Za-z]+ [0-9]+, ([0-9]+).*/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
            MEETING_NAME=$(echo "$MEETING_INFO" | sed -E 's/^[A-Za-z]+ [0-9]+, [0-9]+ - (.*)/\1/' | sed 's/^[ \t]*//;s/[ \t]*$//')
            # Map the month word to a zero-padded number; "--" flags an unparsed month.
            case "$MEETING_MONTH_WORD" in
              January)   MEETING_MONTH="01" ;;
              February)  MEETING_MONTH="02" ;;
              March)     MEETING_MONTH="03" ;;
              April)     MEETING_MONTH="04" ;;
              May)       MEETING_MONTH="05" ;;
              June)      MEETING_MONTH="06" ;;
              July)      MEETING_MONTH="07" ;;
              August)    MEETING_MONTH="08" ;;
              September) MEETING_MONTH="09" ;;
              October)   MEETING_MONTH="10" ;;
              November)  MEETING_MONTH="11" ;;
              December)  MEETING_MONTH="12" ;;
              *)         MEETING_MONTH="--" ;;
            esac
            FOUNDMEETING="TRUE"
          fi
        done < "$SEARCH_PAGE"
      else
        SEARCH_END="TRUE"
        echo "SCRAPE_MEETINGS: No more pages!"
      fi
    else
      SEARCH_END="TRUE"
      echo "SCRAPE_MEETINGS: No more pages!"
    fi
    j=$((j + 1))
  done
  i=$((i + 1))
done
echo "Done job: SCRAPE_MEETINGS: $(date)"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment