Last active
May 10, 2017 02:26
-
-
Save jakeoung/18bf4e0f022fe67d706ce896ab190550 to your computer and use it in GitHub Desktop.
convert bibliography for ai
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
| import requests | |
| import titlecase | |
# Ordinal words that may appear in an 'edition' field, mapped to the numeral
# that replaces them.  Ten is assumed to be a high enough ceiling.
words_to_numerals = dict(
    first='1',
    second='2',
    third='3',
    fourth='4',
    fifth='5',
    sixth='6',
    seventh='7',
    eighth='8',
    ninth='9',
    tenth='10',
)

# Journal titles that get a leading 'The ' added
# (see add_definite_to_journaltitles).
journals_needing_article = {
    'Journal of Philosophy',
    'Philosophical Quarterly',
    'Philosophical Review',
}
# JK: acronyms that must stay fully upper-cased when titles are re-cased.
list_abbr = ['UCLA']


def abbreviations(word, **kwargs):
    """
    Callback passed to ``titlecase.titlecase`` by ``case_title``.

    :param word: a single word of the title being re-cased.
    :type word: str
    :param kwargs: extra keyword arguments from the caller; ignored.
    :returns: str or None -- the upper-cased word when it is listed in
        ``list_abbr``, otherwise None (presumably titlecase then applies
        its default casing -- confirm against the titlecase docs).
    """
    shouted = word.upper()
    return shouted if shouted in list_abbr else None
def remove_outer_braces(s):
    """
    str -> str
    Strip one enclosing pair of braces, but only when the string contains
    no other braces.

    Strings with inner braces (e.g. '{This {is} a test}') are returned
    untouched; handling those is a possible later refinement.

    >>> remove_outer_braces('{This is a test}')
    'This is a test'
    >>> remove_outer_braces('This is a test')
    'This is a test'
    >>> remove_outer_braces('{This} is a test')
    '{This} is a test'
    """
    only_outer_pair = re.search('^{[^{}]*}$', s) is not None
    return s[1:-1] if only_outer_pair else s
def full_range(s):
    """
    str -> str
    Expand a truncated page range such as '100--45' or '100-15'.

    The missing leading digits of the end page are copied from the start
    page.  The result is joined with a SINGLE hyphen; callers that want the
    biblatex '--' form must normalise afterwards.  A string without any
    hyphen (a single page, or an empty string) is returned unchanged
    instead of raising IndexError.

    >>> full_range('100--115')
    '100-115'
    >>> full_range('100-1000')
    '100-1000'
    >>> full_range('100-15')
    '100-115'
    >>> full_range('100-5')
    '100-105'
    >>> full_range('100')
    '100'
    """
    parts = re.split('-+', s)
    if len(parts) < 2:
        # Not a range at all: nothing to expand.
        return s
    missing = len(parts[0]) - len(parts[1])
    if missing > 0:
        parts[1] = parts[0][:missing] + parts[1]
    return '-'.join(parts)
def remove_resolver(doi):
    """
    str -> str
    Remove a DOI resolver prefix, leaving the bare DOI.

    Handles 'http://dx.doi.org/' as well as the 'https://' and plain
    'doi.org' variants.  The dots are escaped so that only real resolver
    URLs are stripped.

    >>> remove_resolver('http://dx.doi.org/10.1080/00455091.2013.871111')
    '10.1080/00455091.2013.871111'
    >>> remove_resolver('https://doi.org/10.1080/00455091.2013.871111')
    '10.1080/00455091.2013.871111'
    >>> remove_resolver('10.1080/00455091.2013.871111')
    '10.1080/00455091.2013.871111'
    """
    return re.sub(r'https?://(?:dx\.)?doi\.org/', '', doi)
def title_name(name):
    """
    str -> str
    Take a name list and return it in title case, leaving the separator
    word 'and' alone.

    Only the exact word 'and' is treated as a separator, so names that
    merely begin with those letters ('anderson', 'andrew') are still
    capitalised.  Runs of whitespace are collapsed to single spaces.

    >>> title_name('hodgson, thomas')
    'Hodgson, Thomas'
    >>> title_name('hodgson, thomas and CHOMSKY, NOAM')
    'Hodgson, Thomas and Chomsky, Noam'
    >>> title_name('anderson, andy')
    'Anderson, Andy'
    """
    return ' '.join(
        word if word == 'and' else word.title() for word in name.split()
    )
def braces(s):
    """
    str -> str
    Wrap a string in braces, adding only the brace(s) that are missing.

    The opening and closing brace are checked independently: '{' is
    prepended unless the string already starts with one, '}' is appended
    unless it already ends with one.

    >>> braces('foo')
    '{foo}'
    >>> braces('{foo}')
    '{foo}'
    """
    prefix = '' if s.startswith('{') else '{'
    suffix = '' if s.endswith('}') else '}'
    return prefix + s + suffix
def remove_eprint(record):
    """
    Drop the 'eprint' field, if the record has one.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    record.pop("eprint", None)
    return record
def issue_to_number(record):
    """
    Rename a purely numeric 'issue' field to 'number'.

    Nothing happens when the record already has a 'number' field or when
    'issue' contains anything other than digits.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    if "issue" not in record or "number" in record:
        return record
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    if re.fullmatch(r'\d+', record["issue"]):
        record["number"] = record.pop("issue")
    return record
def remove_leading_zeros(record):
    """
    Remove leading zeroes from the 'volume' and 'number' fields.

    A value consisting only of zeroes is reduced to a single '0' rather
    than to an empty string, so the field never becomes empty by accident.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    for field in ("volume", "number"):
        if field in record:
            value = record[field]
            # value[-1:] is '0' for an all-zero value and '' for an empty one.
            record[field] = value.lstrip('0') or value[-1:]
    return record
def remove_numpages(record):
    """
    Drop the 'numpages' field, if the record has one.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    record.pop("numpages", None)
    return record
def remove_month(record):
    """
    Drop the 'month' field, if the record has one.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    record.pop("month", None)
    return record
def remove_series(record):
    """
    Drop the 'series' field, if the record has one.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    record.pop("series", None)
    return record
def philpapers(record):
    """
    Put the PhilPapers ID in a 'philpapers' field.

    This function assumes that the ID of the record is a PhilPapers ID.
    The ID is split at hyphens: if the last segment is a number the
    PhilPapers ID is the last two segments ('FOOBAR-1'), otherwise it is
    the last segment alone.  IDs without a hyphen are left alone.

    :param record: the record (modified in place); must have an 'ID' key.
    :type record: dict
    :returns: dict -- the same record object.
    """
    if '-' not in record["ID"]:
        return record
    segments = record["ID"].split('-')
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    if re.fullmatch(r'\d+', segments[-1]):
        # ID of the form '...FOOBAR-1'
        ppid = '{}-{}'.format(segments[-2], segments[-1])
    else:
        ppid = segments[-1]
    record["philpapers"] = ppid
    return record
def subtitles(record):
    """
    Split titles at their first colon into title and subtitle.

    'journaltitle' is split into 'journaltitle' + 'journalsubtitle' and
    'title' into 'title' + 'subtitle'.  Both halves are stripped of
    surrounding whitespace.  Fields without a colon are left alone; an
    existing subtitle field is overwritten when a split happens.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    for main_field, sub_field in (
        ("journaltitle", "journalsubtitle"),
        ("title", "subtitle"),
    ):
        if main_field not in record:
            continue
        head, colon, tail = record[main_field].partition(':')
        if colon:
            record[main_field] = head.strip()
            record[sub_field] = tail.strip()
    return record
def add_definite_to_journaltitles(record):
    """
    Prefix 'The ' to the journal title when it is one of the titles listed
    in ``journals_needing_article``.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    if "journaltitle" not in record:
        return record
    current = record["journaltitle"]
    if current in journals_needing_article:
        record["journaltitle"] = 'The ' + current
    return record
def remove_pages_from_entry(record, entry):
    """
    Drop the 'pages' field from records of one entry type.

    :param record: the record (modified in place); must have 'ENTRYTYPE'.
    :type record: dict
    :param entry: the ENTRYTYPE whose records lose their 'pages' field,
        e.g. 'incollection', 'inbook' or 'inproceedings'.
    :type entry: str
    :returns: dict -- the same record object.
    """
    if record["ENTRYTYPE"] == entry:
        record.pop("pages", None)
    return record
def active_quotes(record):
    """
    Replace LaTeX quotes with unicode quotes,
    defined as active characters by csquotes.

    Every field of the record is processed, so all values must be strings.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    # The regexes must be done like this to avoid balance problems.
    # Raw strings are used because '\s' and '\w' are invalid escape
    # sequences in a normal literal; both patterns are compiled once
    # instead of once per field.
    #
    # Opening: one or two '`', one or two "'", one '"', or one '“',
    # preceded by space or the start of the string and followed by a
    # word character.
    opening = re.compile(r"(?:(?<=\s)|(?<=^))((`|'){1,2}|\"|“)(?=\w)")
    # Closing: one or two "'", one '"', or one '”', preceded by a word
    # character and followed by space or the end of the string.
    closing = re.compile(r"(?<=\w)('{1,2}|\"|”)(?:(?=\s)|(?=$))")
    for field in record:
        record[field] = closing.sub('’', opening.sub('‘', record[field]))
    return record
def remove_protection(record):
    """
    Strip unnecessary outer braces from 'title' and 'subtitle'.

    Each field is passed through ``remove_outer_braces``.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    for field in ("title", "subtitle"):
        if field in record:
            record[field] = remove_outer_braces(record[field])
    return record
def citeulike(record):
    """
    Drop CiteULike's special fields
    ('citeulike-article-id', 'priority', 'posted-at').

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    for special in ("citeulike-article-id", "priority", "posted-at"):
        record.pop(special, None)
    return record
def empty_fields(record):
    """
    Drop every field whose value is the empty string.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    # Collect the keys first: a dict cannot shrink while being iterated.
    blanks = [name for name, value in record.items() if value == '']
    for name in blanks:
        del record[name]
    return record
def biblatex_page_ranges(record):
    """
    Normalise the 'pages' field to a biblatex range ('100--115').

    'p.', 'pp.' and similar markers are removed, a truncated end page is
    expanded with ``full_range`` and the separator becomes two hyphens.
    If the cleaned value is not of the form digits-hyphens-digits, a
    message is printed and the cleaned value is kept.

    :param record: the record (modified in place); 'ID' is read for the
        message.
    :type record: dict
    :returns: dict -- the same record object.
    """
    if "pages" not in record:
        return record
    # Get rid of p., pp. etc.
    # NOTE(review): this removes every 'p'/'P' in the value, not only a
    # leading marker -- kept as is, confirm that this is intended.
    record["pages"] = re.sub(r'[Pp]{1,2}\.?', '', record["pages"]).strip()
    if re.search(r'^\d+-+\d+$', record["pages"]):
        # full_range returns a single-hyphen range,
        # so do the normalisation afterwards
        expanded = full_range(record["pages"])
        record["pages"] = re.sub('-+', '--', expanded)
    else:
        print(
            "The 'Pages' field for record {} isn't a valid biblatex range.".format(
                record["ID"]
            )
        )
    return record
def non_page_hyphens(record):
    """
    Normalise every run of hyphens in 'volume' and 'number' to exactly two.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    for field in ("volume", "number"):
        if field in record:
            record[field] = re.sub('-+', '--', record[field])
    return record
def dashes(record):
    """
    Replace unicode dashes with their hyphen spelling in every field:
    an en dash becomes '--' and an em dash becomes '---'.

    :param record: the record (modified in place); values must be strings.
    :type record: dict
    :returns: dict -- the same record object.
    """
    for field, value in record.items():
        record[field] = value.replace('–', '--').replace('—', '---')
    return record
def remove_keyword(record):
    """
    Drop the 'keywords' and 'keyword' fields, if present.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    for name in ("keywords", "keyword"):
        record.pop(name, None)
    return record
def strip_doi(record):
    """
    Pass the 'doi' field, if any, through ``remove_resolver`` so that only
    the bare DOI remains.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    if "doi" not in record:
        return record
    record["doi"] = remove_resolver(record["doi"])
    return record
def get_doi(record):
    """
    Get DOIs for articles from the CrossRef API.

    Only records with ENTRYTYPE 'article' and no 'doi' field are looked up.
    The query is built from the 'title' and 'author' fields; if neither is
    present no request is made.  The DOI of the best-scoring hit is stored
    in 'doi'.  Progress and problems are reported with ``print``.

    :param record: the record (modified in place); must have 'ENTRYTYPE',
        and 'ID' when a request is made.
    :type record: dict
    :returns: dict -- the same record object.
    :raises requests.exceptions.ConnectionError: when the API cannot be
        reached.  This is deliberately not caught here so that the caller
        can stop attempting further lookups.
    """
    if record["ENTRYTYPE"] != "article" or "doi" in record:
        return record
    # Build a search term for the API.
    # The API doesn't like spaces or exotic characters.
    query = ''
    if "title" in record:
        query += re.sub(r'\W+', '+', record["title"])
    if "author" in record:
        query += '+' + re.sub(r'\W+', '+', record["author"])
    # Make sure a query has been built
    if not query:
        return record
    payload = {
        'query': query,
        'rows': '1',
        'sort': 'score',
        'order': 'desc'
    }
    r = requests.get(
        'http://api.crossref.org/works',
        params=payload
    )
    print(
        'I got status code {} from the CrossRef API for record {}.'.format(
            r.status_code,
            record["ID"]
        )
    )
    # Proceed only if the status code was a good one
    if r.status_code != requests.codes.ok:
        return record
    try:
        # The result is JSON text; 'items' is a list in order of match
        # score and its first element should carry a DOI.
        record["doi"] = r.json()['message']['items'][0]['DOI']
    except UnicodeEncodeError:
        # Listed first because UnicodeEncodeError is a ValueError.
        # This deals with errors caused by encoding problems,
        # which are fixed anyway by having the conversion
        # to unicode done before authors are dealt with.
        print(
            "I couldn't get a DOI. A character in record {} wasn't encoded in a way the CrossRef API understands.".format(
                record["ID"]
            )
        )
    except (IndexError, KeyError, ValueError):
        # ValueError covers a response body that is not valid JSON.
        print("I couldn't find a DOI in the JSON for record {}.".format(
            record["ID"]
            )
        )
    return record
def titlecase_name(record):
    """
    Put the 'author' and 'editor' fields into title case
    by passing each through ``title_name``.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    for role in ("author", "editor"):
        if role in record:
            record[role] = title_name(record[role])
    return record
def publisher(record):
    """
    Protect 'and' in the publisher field by wrapping the whole field in
    braces (via ``braces``).

    The check is a plain substring search for 'and'.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    if "publisher" in record and re.search('and', record["publisher"]):
        record["publisher"] = braces(record["publisher"])
    return record
def edition(record):
    """
    Put "Edition" in a nice format.

    The value is lower-cased and stripped, then:

    * an ordinal word listed in ``words_to_numerals`` ('second') becomes
      its numeral ('2');
    * otherwise, if it contains an ordinal numeral ('2nd'), the suffix is
      removed from each such numeral.  Only suffixes directly attached to
      digits are removed, so other words containing 'st', 'nd', 'rd' or
      'th' are no longer mangled.

    Any other value is left untouched.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    if "edition" not in record:
        return record
    value = record["edition"].lower().strip()
    if value in words_to_numerals:
        record["edition"] = words_to_numerals[value]
    elif re.search(r'\d+(st|nd|rd|th)', value):
        record["edition"] = re.sub(r'(\d+)(?:st|nd|rd|th)', r'\1', value)
    return record
def journaltitle(record):
    """
    Rename the 'journal' field to 'journaltitle'.

    An existing 'journaltitle' is overwritten.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    if "journal" in record:
        record["journaltitle"] = record.pop("journal")
    return record
def case_title(record):
    """
    Put titles in titlecase for English records.

    A record counts as English when it has no 'language' field or when
    that field is exactly 'English'.  The fields 'title', 'subtitle',
    'booktitle' and 'journal' are re-cased, with ``abbreviations`` as the
    per-word callback.

    Depends on the 'titlecase' module
    https://pypi.python.org/pypi/titlecase/

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    if "language" in record and record["language"] != 'English':
        return record
    for field in ("title", "subtitle", "booktitle", "journal"):
        if field in record:
            record[field] = titlecase.titlecase(
                record[field], callback=abbreviations
            )
    return record
def join_author_editor(record):
    """
    Flatten the 'author' and 'editor' lists into single strings joined
    by " and ".

    'author' is read as a list of strings; 'editor' is read as a list of
    dicts whose 'name' entry is used.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    separator = " and "
    if "author" in record:
        record["author"] = separator.join(record["author"])
    if "editor" in record:
        editor_names = [entry['name'] for entry in record["editor"]]
        record["editor"] = separator.join(editor_names)
    return record
def booktitle(record):
    """
    For 'book' entries, copy the 'title' field into 'booktitle'.

    :param record: the record (modified in place); must have 'ENTRYTYPE'.
    :type record: dict
    :returns: dict -- the same record object.
    """
    is_book = record["ENTRYTYPE"] == "book"
    if is_book and "title" in record:
        record["booktitle"] = record["title"]
    return record
def remove_abstract(record):
    """
    Drop the 'abstract' field, if the record has one.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    record.pop("abstract", None)
    return record
def remove_epub(record):
    """
    Remove the 'epub' field.

    Previously this deleted 'issn' whenever 'epub' was present, which
    raised KeyError for records without an ISSN and never removed 'epub'.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    record.pop("epub", None)
    return record
def remove_ISSN(record):
    """
    Drop the 'issn' field, if the record has one.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    record.pop("issn", None)
    return record
def remove_ISBN(record):
    """
    Drop the 'isbn' field, if the record has one.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    record.pop("isbn", None)
    return record
def remove_copyright(record):
    """
    Drop the 'copyright' field, if the record has one.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    record.pop("copyright", None)
    return record
def language(record):
    """
    Tidy the 'language' / 'langid' pair.

    * 'language' equal to 'English': both 'language' and 'langid' are
      removed.
    * any other 'language': 'langid' is set to its lower-cased value.
    * 'langid' without 'language': a message is printed and nothing is
      changed.

    :param record: the record (modified in place); 'ID' is read for the
        message.
    :type record: dict
    :returns: dict -- the same record object.
    """
    if "language" in record:
        if record["language"] == 'English':
            del record["language"]
            record.pop("langid", None)
        else:
            record["langid"] = record["language"].lower()
    elif "langid" in record:
        message = "There is a 'Langid' of '{}'' but no 'Language' field for record {}."
        print(message.format(record["langid"], record["ID"]))
    return record
def remove_publisher(record):
    """
    Drop the 'publisher' field, if the record has one.

    The entry type is not checked: the field is removed from any record.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    record.pop("publisher", None)
    return record
def remove_link(record):
    """
    Drop the 'link' field, if the record has one.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    record.pop("link", None)
    return record
def remove_ampersand(record):
    """
    Convert escaped ampersands ('\\&') to 'and' in the 'booktitle',
    'journal', 'subtitle' and 'title' fields.

    All four fields now use the same pattern.  Previously 'title' and
    'subtitle' used a pattern that required TWO backslashes before the
    ampersand and therefore never matched a normally escaped '\\&'.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    for field in ("booktitle", "journal", "subtitle", "title"):
        if field in record:
            record[field] = re.sub(r'\\&', 'and', record[field])
    return record
def escape_characters(record):
    """
    Make sure that characters reserved by LaTeX ('&', '%', '_') are
    escaped with a backslash in every field except 'ID'.

    Characters that already have a backslash in front of them are left
    alone.  All values other than 'ID' must be strings.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    # One pass with a character class instead of one pass per character;
    # raw strings avoid the invalid '\{' escape of the old replacement.
    unescaped = re.compile(r'(?<!\\)[&%_]')
    for field in record:
        # Underscores are ok in IDs, which shouldn't have other special
        # characters anyway
        if field != "ID":
            record[field] = unescaped.sub(r'\\\g<0>', record[field])
    return record
def jstor(record):
    """
    Drop JSTOR's special fields ('jstor_articletype',
    'jstor_formatteddate', 'jstor_issuetitle').

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    for special in (
        "jstor_articletype",
        "jstor_formatteddate",
        "jstor_issuetitle",
    ):
        record.pop(special, None)
    return record
def protect(s):
    """
    Str -> Str
    Helper function for `protect_capitalisation`.

    Take a string and return a string where words containing capital
    letters (after the first word), and the word following ': ', are
    protected with braces.

    The wrapping is done in a single substitution pass, so that

    * words containing regex metacharacters ('C++') are handled literally,
    * a word occurring several times is wrapped once per occurrence
      instead of being wrapped repeatedly, and
    * only the matched occurrences are wrapped (never the first word).

    >>> protect('an introduction to Kant and Kant')
    'an introduction to {Kant} and {Kant}'
    """
    return re.sub(
        r'(?<=\s)\S*[A-Z]+\S*|(?<=:\s)\S+',
        lambda match: '{' + match.group(0) + '}',
        s,
    )
def protect_capitalisation(record):
    """
    Brace-protect capitalised words in 'title', 'subtitle' and
    'booktitle' by passing each through ``protect``.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    for field in ("title", "subtitle", "booktitle"):
        if field in record:
            record[field] = protect(record[field])
    return record
def multivolume(record):
    """
    Promote a 'book' or 'collection' that has a 'volume' field to
    ENTRYTYPE 'mvbook' / 'mvcollection'.

    :param record: the record (modified in place); must have 'ENTRYTYPE'.
    :type record: dict
    :returns: dict -- the same record object.
    """
    promoted = {"book": "mvbook", "collection": "mvcollection"}
    kind = record["ENTRYTYPE"]
    if kind in promoted and "volume" in record:
        record["ENTRYTYPE"] = promoted[kind]
    return record
def remove_booktitle(record):
    """
    Drop the 'booktitle' field, if the record has one.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    record.pop("booktitle", None)
    return record
def year_to_date(record):
    """
    Rename the 'year' field to 'date'.

    An existing 'date' is overwritten.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    if "year" in record:
        record["date"] = record.pop("year")
    return record
| """ | |
| Added by koo | |
| """ | |
# [regex, replacement] pairs applied in order by convert_abbreviations_ieee.
# "International" is abbreviated "Int." (it was "Inc." before, which is the
# abbreviation of "Incorporated").
list_abbrevations = [
    ["[Jj]ournal", "J."],
    ["International", "Int."],
    ["Transactions", "Trans."],
    ["[Aa]nalysis", "Anal."],
    ["[Rr]ecognition", "Recog."],
]


def convert_abbreviations_ieee(record, key):
    """
    Abbreviate common words in one field using ``list_abbrevations``.

    Every pattern is substituted wherever it occurs in the field,
    including inside longer words.

    :param record: the record (modified in place).
    :type record: dict
    :param key: name of the field to abbreviate; nothing happens when the
        record has no such field.
    :type key: str
    :returns: dict -- the same record object.
    """
    if key in record:
        for pattern, abbreviation in list_abbrevations:
            record[key] = re.sub(pattern, abbreviation, record[key])
    return record
def remove_year_in_key(record, key):
    """
    Strip year markers from one field.

    Two things are removed: any run of four digits together with the dots
    and spaces that follow it ('2015. '), and an apostrophe followed by
    two digits ("'15").

    :param record: the record (modified in place).
    :type record: dict
    :param key: name of the field to clean; nothing happens when the
        record has no such field.
    :type key: str
    :returns: dict -- the same record object.
    """
    if key not in record:
        return record
    without_long_year = re.sub(r"[0-9]{4}\.* *", '', record[key])
    record[key] = re.sub(r"\'[0-9]{2}", '', without_long_year)
    return record
# Known proceedings.  Each row lists the spellings of one venue:
# index 0 = acronym, 1 = full name, 2 = abbreviated name, and optionally
# further alternative spellings.  Any of them identifies the venue.
list_proc = [
    ["CVPR", "Computer Vision and Pattern Recognition",
     "Proc. IEEE Conf. Comput. Vis. Pattern Recog."],
    ["ECCV", "European Conference on Computer Vision",
     "Proc. Eur. Conf. Comput. Vis."],
    ["ICCV", "International Conference on Computer Vision",
     "Proc. Int. Conf. Comput. Vis.", "Int'l Conf. Computer Vision"],
    ["BMVC", "British Machine Vision Conference",
     "Proc. British Mach. Vis. Conf."],
    ["SSVM", "Scale Space and Variational Methods in Computer Vision",
     "Proc. Int. Conf. on Scale Space and Variational Methods in Comput. Vis."],
    ["ICIP", "International Conference on Image Processing",
     "Proc. Int. Conf. on Image Processing"],
]

# Known journals, same row layout as list_proc.
list_jour = [
    ["PAMI", "Pattern Analysis and Machine Intelligence",
     "IEEE Trans. Pattern Anal. Mach. Intell."],
    ["IJCV", "International Journal of Computer Vision",
     "Int. J. of Comput. Vis."],
    ["TIP", "Transactions on Image Processing",
     "IEEE Trans. on Image Processing"],
    ["JMIV", "Journal of Mathematical Imaging and Vision",
     "J. of Math. Imaging and Vis."],
]


def _find_known_venue(name, venues):
    """Return the first row of *venues* with a spelling found in *name*, else None."""
    for spellings in venues:
        # Spellings are literal text, so escape them ('.' must not match
        # any character).
        pattern = '|'.join(re.escape(spelling) for spelling in spellings)
        if re.search(pattern, name, flags=re.IGNORECASE):
            return spellings
    return None


def unify_ai_titles(record, abbrev_degree=2, verbose=1):
    """
    Unify AI proceedings and journal names based on abbrev_degree.

    If the record has a 'booktitle' it is matched against ``list_proc``;
    otherwise a 'journal' field is matched against ``list_jour``.
    Matching is a case-insensitive substring search for any spelling of a
    venue; the first matching venue wins and the field is replaced by the
    spelling at index ``abbrev_degree``.

    NOTE(review): short acronyms match inside longer words (e.g. 'TIP'
    inside 'Multiple') -- confirm whether that is acceptable.

    :param record: the record (modified in place).
    :type record: dict
    :param abbrev_degree: index of the spelling to write:
        0: acronym
        1: full name
        2: abbreviated name (IEEE Computer Society Style Guide)
        Larger values raise IndexError for venues without that many
        spellings.
    :type abbrev_degree: int
    :param verbose: when true, print the resulting field value.
    :returns: dict -- the same record object.
    """
    if "booktitle" in record:
        field, venues = "booktitle", list_proc
    elif "journal" in record:
        field, venues = "journal", list_jour
    else:
        return record
    spellings = _find_known_venue(record[field], venues)
    if spellings is not None:
        record[field] = spellings[abbrev_degree]
    if verbose:
        print(record[field])
    return record
def remove_page_if_doi(record):
    """
    Remove the page information when a DOI is available.

    Both the standard 'pages' field and a non-standard 'page' field are
    removed.  Previously only 'page' was deleted, unconditionally, which
    raised KeyError for every record that had a DOI but no 'page' field.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    if "doi" in record:
        for field in ("pages", "page"):
            record.pop(field, None)
    return record
def remove_organization(record):
    """
    Drop the 'organization' field, if the record has one.

    :param record: the record (modified in place).
    :type record: dict
    :returns: dict -- the same record object.
    """
    record.pop("organization", None)
    return record
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import argparse | |
| import re | |
| import shutil | |
| import sys | |
| import cb_customs | |
| from bibtexparser.bparser import BibTexParser | |
| from bibtexparser.bwriter import BibTexWriter | |
| from bibtexparser.customization import * | |
def fix_keys(l):
    """ list -> list
    Take a list that represents lines.
    Find lines which are the start of a bibtex entry without a key and
    add dummy keys ('Foo1', 'Foo2', ...) to those lines.
    Make sure every other entry-start line ends with exactly one comma.

    Keys are otherwise kept verbatim (stripping of non-word characters
    from keys is intentionally disabled).  Lines starting with '@' that
    contain no '{' are left untouched.  The list is modified in place.

    >>> fix_keys(
    ...     ['@book{foo,', '@article{', ' Author = {Thomas Hodgson}', '}']
    ... )
    ['@book{foo,', '@article{Foo1,', ' Author = {Thomas Hodgson}', '}']
    """
    next_dummy = 1
    for index, line in enumerate(l):
        stripped = line.strip()
        if re.fullmatch(r'@\w+\s*\{,?', stripped):
            # Entry start without a key: insert a dummy one.
            l[index] = line[:line.find('{') + 1] + 'Foo' + str(next_dummy) + ','
            next_dummy += 1
        elif stripped.startswith('@'):
            brace = line.find('{')
            if brace == -1:
                # No opening brace, so there is no key to fix.
                continue
            key_start = brace + 1
            # Drop trailing whitespace and comma(s) before re-adding a
            # single comma, so 'foo,' does not become 'foo,,'.
            key = line[key_start:].rstrip().rstrip(',')
            l[index] = line[:key_start] + key + ','
    return l
def customizations(record):
    """Clean up one parsed record with the helpers from ``cb_customs``.

    The steps run in a fixed order.  Unless the module-level ``args.nodoi``
    is set, a DOI lookup is attempted at the end; the first connection
    error sets ``args.nodoi`` so that no further lookups are tried.

    :param record: a record
    :returns: -- customized record
    """
    # Steps that take only the record, in execution order.
    # Currently disabled steps: unicode conversion, page double hyphens,
    # author/editor splitting and joining, language, removal of pages from
    # books and collections, biblatex page ranges, year_to_date,
    # issue_to_number.
    single_argument_steps = (
        cb_customs.titlecase_name,
        cb_customs.case_title,
        # This should come after `journaltitle` is called
        cb_customs.add_definite_to_journaltitles,
        cb_customs.non_page_hyphens,
        cb_customs.dashes,
        cb_customs.remove_abstract,
        cb_customs.remove_ISBN,
        cb_customs.remove_ISSN,
        cb_customs.remove_epub,
        cb_customs.remove_copyright,
        cb_customs.remove_publisher,
        cb_customs.remove_organization,
        cb_customs.remove_link,
        cb_customs.escape_characters,
        cb_customs.remove_ampersand,
        cb_customs.jstor,
        cb_customs.citeulike,
        cb_customs.edition,
        cb_customs.multivolume,
        cb_customs.strip_doi,
        cb_customs.remove_keyword,
        cb_customs.empty_fields,
        cb_customs.remove_protection,
        cb_customs.subtitles,
        cb_customs.remove_series,
        cb_customs.remove_month,
        cb_customs.remove_numpages,
        cb_customs.remove_eprint,
        # The order of the following matters
        cb_customs.remove_leading_zeros,
    )
    for step in single_argument_steps:
        record = step(record)

    record = cb_customs.remove_year_in_key(record, "booktitle")
    record = cb_customs.unify_ai_titles(record)
    for field in ("booktitle", "journal"):
        record = cb_customs.convert_abbreviations_ieee(record, field)
    record = cb_customs.remove_pages_from_entry(record, "inproceedings")

    if args.nodoi:
        return record
    try:
        record = cb_customs.get_doi(record)
    except cb_customs.requests.exceptions.ConnectionError:
        # If there is a connection error stop trying to get DOIs
        if args.verbose:
            print(
                "I couldn't connect to the CrossRef API. "
                "Perhaps you are not connected to the internet?"
            )
        args.nodoi = True
    return record
if __name__ == "__main__":
    # NOTE: this code stays at module level (not inside a main() function)
    # because customizations() reads the module-global `args`.
    import os

    parser = argparse.ArgumentParser()
    # The input file is optional: without it the bibliography is read from
    # standard input and the result is written to standard output.
    parser.add_argument(
        'input',
        nargs='?',
        help='input bib file (standard input is read when omitted)'
    )
    parser.add_argument(
        '--no-doi',
        dest='nodoi',
        action='store_true',
        help="Don't look for DOIs from CrossRef"
    )
    parser.add_argument(
        '--verbose',
        dest='verbose',
        action='store_true',
        help="Print messages"
    )
    args = parser.parse_args()

    if args.input:
        print(args.input)
        bib = args.input
        try:
            with open(bib, 'r', encoding='utf-8') as biblatex:
                content = biblatex.read()
        except FileNotFoundError:
            # Report on stderr and exit with a non-zero status instead of
            # stopping silently.
            sys.exit("I couldn't find the file {}.".format(bib))
    else:
        content = sys.stdin.read()

    # Find the start of the first record
    first_record = re.search('@', content)
    if first_record is None:
        sys.exit("The file I was given didn't contain any records.")
    content = content[first_record.start():].split('\n')

    # Provide dummy citekeys
    content = fix_keys(content)
    fixed_content = '\n'.join(content)
    bibliography = BibTexParser(
        fixed_content,
        customization=customizations,
        ignore_nonstandard_types=False
        # Otherwise bibtexparser will complain if I give it a collection
    )
    output = BibTexWriter().write(bibliography)

    if args.input:
        # Strip whatever extension the input has (not a fixed 4 characters)
        # before adding the output suffix.
        output_name = os.path.splitext(args.input)[0] + '_convert.bib'
        with open(output_name, 'w', encoding='utf-8') as biblatex:
            biblatex.write(output)
    else:
        sys.stdout.write(output)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment