Last active
December 2, 2015 12:03
-
-
Save jvzammit/73b625c6f88495cda036 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| import bs4 | |
| import csv | |
| from django.core.management import setup_environ | |
| from ca import settings | |
| setup_environ(settings) | |
| from buyers_guide.models import ( | |
| BuyersGuide, | |
| GuideFeature, | |
| GuideReview) | |
# Start of the Office ("mso") conditional-comment markup that marks a field
# value as needing clean-up.
OPENING_TAG = '<!--[if gte mso 9]><xml>'
# Marker that terminates the pasted fragment.
CLOSING_TAG = '<!--EndFragment-->'

# Keyed by model class name; main() resets each entry to an empty list.
res = {}

# Maps each model class to the names of the text fields that main() scans
# for OPENING_TAG.
model_fields = {
    BuyersGuide: [
        'description',
        'author_note',
        'legal_note',
        'more_info',
        'references',
    ],
    GuideFeature: ['description'],
    GuideReview: ['description'],
}
def get_string_value(field_value):
    """Return the plain text of ``field_value`` with the pasted markup removed.

    Everything from the first ``OPENING_TAG`` up to and including the
    ``<!--StartFragment-->`` marker that follows it is dropped, and so is the
    ``CLOSING_TAG`` marker after the fragment.  The text before the opening
    tag, inside the fragment and after the closing tag is concatenated, any
    remaining HTML tags are stripped, and surrounding whitespace is trimmed.

    Args:
        field_value: Text containing ``OPENING_TAG``, a start-fragment marker
            and ``CLOSING_TAG``, in that order.

    Returns:
        The cleaned text encoded as UTF-8.

    Raises:
        ValueError: If any of the three markers is missing, or they do not
            appear in the expected order.
    """
    start_fragment = '<!--StartFragment-->'

    junk_start = field_value.index(OPENING_TAG)
    # Anchor each search at the previous marker so that a marker occurring
    # earlier in the text cannot produce overlapping slices (which would
    # duplicate content in the result).
    junk_end = (
        field_value.index(start_fragment, junk_start) + len(start_fragment))
    closing_start = field_value.index(CLOSING_TAG, junk_end)
    closing_end = closing_start + len(CLOSING_TAG)

    # Concatenate:
    #   start of string    -> start of opening tag
    #   end of opening tag -> start of closing tag
    #   end of closing tag -> end of string
    cleaned_str = (
        field_value[:junk_start] +
        field_value[junk_end:closing_start] +
        field_value[closing_end:])

    # Name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed, so the output could differ between machines.
    soup = bs4.BeautifulSoup(cleaned_str, 'html.parser')
    # get_text() removes any remaining tags (e.g. <p>).
    return soup.get_text().strip().encode('utf-8')
def get_url_affected(model_class, instance):
    """Return the public URL of the page on which ``instance`` is shown.

    For ``GuideFeature`` and ``GuideReview`` the URL is that of the related
    ``instance.guide``; ``'N/A'`` is returned when that guide is falsy.  For
    any other model class the instance's own absolute URL is used.
    """
    base_url = 'http://www.consumeraffairs.com'

    page_owner = instance
    if model_class in (GuideFeature, GuideReview):
        # Features and reviews are addressed through their parent guide.
        page_owner = instance.guide
        if not page_owner:
            return 'N/A'

    return base_url + page_owner.get_absolute_url()
def get_admin_url(model_class, instance):
    """Return the buyers-guide admin URL relevant to ``instance``.

    For ``GuideFeature`` and ``GuideReview`` the URL is built from the primary
    key of the related ``instance.guide``; ``'N/A'`` is returned when that
    guide is falsy.  For any other model class the instance's own primary key
    is used.
    """
    _url = 'https://www.consumeraffairs.com/admin/buyers_guide/buyersguide/{}/'

    admin_target = instance
    if model_class in (GuideFeature, GuideReview):
        # Features and reviews link to the admin page of their parent guide.
        admin_target = instance.guide
        if not admin_target:
            return 'N/A'

    return _url.format(admin_target.pk)
def main():
    """Write a CSV report of every model field that contains ``OPENING_TAG``.

    For each model class and field name listed in ``model_fields``, all
    objects of that model are scanned.  Each field value that contains
    ``OPENING_TAG`` produces one row with the model name, primary key, field
    name, cleaned text, public URL and admin URL.  The report is written to
    ``fixed_buyersguide_contents_revised.csv`` in the current directory.
    """
    headers = [
        'item_type', 'item_pk', 'item_field', 'current_value', 'cleaned_value',
        'url_affected', 'admin_url']
    # Python 2's csv module needs the file opened in binary mode; in text mode
    # an extra '\r' is written on platforms that translate newlines.
    with open('fixed_buyersguide_contents_revised.csv', 'wb') as the_file:
        writer = csv.DictWriter(the_file, headers)
        writer.writeheader()
        for model_class, fields_list in model_fields.iteritems():
            res[model_class.__name__] = []
            for field_name in fields_list:
                for obj in model_class.objects.all():
                    field_value = getattr(obj, field_name)
                    # Skip empty / None values: ``in`` on None would raise
                    # TypeError, and an empty value cannot contain the tag.
                    if not field_value or OPENING_TAG not in field_value:
                        continue
                    writer.writerow({
                        'item_type': model_class.__name__,
                        'item_pk': obj.pk,
                        'item_field': field_name,
                        # The raw field value is left out of the report; the
                        # column is kept but written blank.
                        'current_value': '',
                        'cleaned_value': get_string_value(field_value),
                        'url_affected': get_url_affected(model_class, obj),
                        'admin_url': get_admin_url(model_class, obj),
                    })
# Script entry point: generate the report, then confirm completion on stdout.
if __name__ == "__main__":
    main()
    print('DONE')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment