Skip to content

Instantly share code, notes, and snippets.

@ianhenrysmith
Last active November 14, 2018 21:32
Show Gist options
  • Select an option

  • Save ianhenrysmith/1891d8ad5ffc1dd744c6b65c361eda9a to your computer and use it in GitHub Desktop.

Select an option

Save ianhenrysmith/1891d8ad5ffc1dd744c6b65c361eda9a to your computer and use it in GitHub Desktop.
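# Usage: bundle exec rake wikiseed:star_wars SUBDOMAIN=jammies COUNT=1
#
# Environment variables read by these tasks:
#   SUBDOMAIN - required; subdomain used to look up the target newsroom
#   COUNT     - number of pages to import per wiki (defaults to 10)
#   URL       - import this one page instead of random pages (forces a count of 1)
#   EMAIL     - create content as this user instead of the newsroom's first admin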
require 'open-uri'
require 'nokogiri'
namespace :wikiseed do
# Thumbnail selector chain shared by every wiki below that does not need a
# site-specific first choice. Frozen so it can be shared safely.
wikia_sidebar_selectors = [
  '.wiki-sidebar .thumbimage',
  '#mw-content-text .image.image-thumbnail img',
  '.thumbimage'
].freeze

# Per-wiki seeding configuration, keyed by the value stored in @page_type.
#   :category            - category applied to every post imported from the wiki
#   :url                 - "random page" URL fetched for each import
#   :thumbnail_selectors - CSS selectors tried in order when looking for a
#                          thumbnail image
# The nested hashes and selector arrays are frozen as well as the outer hash,
# so the configuration cannot be modified by accident at runtime.
TYPE_MAPPINGS = {
  wikipedia: {
    category: 'Wikipedia',
    url: 'https://en.wikipedia.org/wiki/Special:Random',
    thumbnail_selectors: [
      '#mw-content-text .thumbinner .thumbimage',
      '.thumbimage'
    ].freeze
  }.freeze,
  star_wars: {
    category: 'Star Wars',
    url: 'http://starwars.wikia.com/wiki/Special:Random',
    thumbnail_selectors: [
      '.pi-image-thumbnail',
      '#mw-content-text .image.image-thumbnail img',
      '.thumbimage'
    ].freeze
  }.freeze,
  star_trek: {
    category: 'Star Trek',
    url: 'http://memory-alpha.wikia.com/wiki/Special:Random',
    thumbnail_selectors: wikia_sidebar_selectors
  }.freeze,
  lotr: {
    category: 'Lord of the Rings',
    url: 'http://lotr.wikia.com/wiki/Special:Random',
    thumbnail_selectors: wikia_sidebar_selectors
  }.freeze,
  matrix: {
    category: 'The Matrix',
    url: 'http://matrix.wikia.com/wiki/Special:Random',
    thumbnail_selectors: wikia_sidebar_selectors
  }.freeze,
  marvel: {
    category: 'Marvel',
    url: 'http://marvel.wikia.com/wiki/Special:Random',
    thumbnail_selectors: wikia_sidebar_selectors
  }.freeze,
  dc: {
    category: 'DC Comics',
    url: 'http://dc.wikia.com/wiki/Special:Random',
    thumbnail_selectors: wikia_sidebar_selectors
  }.freeze,
  harry_potter: {
    category: 'Harry Potter',
    url: 'http://harrypotter.wikia.com/wiki/Special:Random',
    thumbnail_selectors: wikia_sidebar_selectors
  }.freeze
}.freeze
# Define one seeding task per configured wiki (wikiseed:star_wars,
# wikiseed:wikipedia, ...). Each task stores its TYPE_MAPPINGS key in
# @page_type, which wiki_data reads, and then runs the shared import loop.
TYPE_MAPPINGS.each do |page_type, config|
  desc "Seed content from random #{config[:category]} wiki pages"
  task page_type => :environment do
    @page_type = page_type
    seed_content
  end
end

# Runs the individual tasks in this explicit order.
desc 'Seed content from all of the wikis'
task all: %i[star_wars star_trek lotr matrix marvel dc harry_potter wikipedia]
# URL of the page to fetch for the next import.
#
# A non-blank URL environment variable overrides the wiki's configured
# "random page" URL. A blank value is ignored so that this method agrees with
# `count`, which only treats URL as an override when it is present.
#
# @return [String] the override URL, or the :url of the current wiki_data entry
def wiki_url
  override = ENV['URL']
  return wiki_data[:url] if override.nil? || override.strip.empty?

  override
end
# Advances the running tally kept in @created_count and returns the new
# value (1 on the first call). Every call increments the tally.
#
# @return [Integer]
def created_count
  @created_count = @created_count.to_i + 1
end
# Creates one piece of content from the given attributes.
#
# Logs a progress line, asks the composer for new content of the seeding post
# type, applies the same attributes through an editor, and returns the
# editor's post.
#
# @param params [Hash] post attributes; :title is echoed in the progress line
# @return [Object] whatever the editor exposes as `post`
def create_content(params)
  sequence = created_count
  puts " - creating content ##{sequence}- #{params[:title]}"

  draft = content_composer.compose_content(post: params, post_type_id: post_type_id)
  content_editor(draft).update(params).post
end
# Composer built from the shared instance context; created on first use and
# reused afterwards.
def content_composer
  return @composer if @composer

  @composer = ::Service::Composition::Composer.new(instance_context)
end
# Builds a new editor for the given content, bound to the shared instance
# context. A fresh editor is returned on every call.
#
# @param content [Object] the content the editor should operate on
def content_editor(content)
  editor_class = ::ContentService::Editor
  editor_class.new(instance_context, content)
end
# Context object built from the acting user and the target newsroom; created
# on first use and reused afterwards.
def instance_context
  @_context || (@_context = ::Service::KapostContext.new(user, newsroom))
end
# Newsroom looked up by the configured subdomain; the lookup result is cached
# once it is truthy.
def newsroom
  return @newsroom if @newsroom

  @newsroom = ::Newsroom.by_sub(subdomain)
end
# Subdomain of the newsroom to seed, taken from the SUBDOMAIN environment
# variable.
#
# @return [String]
# @raise [KeyError] when SUBDOMAIN is not set; the message shows how to pass it
def subdomain
  ENV.fetch('SUBDOMAIN') do
    raise KeyError, 'SUBDOMAIN is required, e.g. `rake wikiseed:star_wars SUBDOMAIN=jammies`'
  end
end
# Number of pages to import per task run.
#
# When the URL environment variable is present a single page is imported;
# otherwise COUNT is used, falling back to 10, and converted with `to_i`.
#
# @return [Integer]
def count
  return 1 if ENV['URL'].present?

  ENV.fetch('COUNT', 10).to_i
end
# Value of the EMAIL environment variable, or nil when it is not set.
#
# @return [String, nil]
def email
  ENV['EMAIL']
end
# User that imported content is created as: the user found by EMAIL when one
# is given, otherwise the first admin of the newsroom. The result is cached
# once it is truthy.
def user
  return @user if @user

  address = email
  @user =
    if address
      ::User.by_email(address)
    else
      newsroom.admins.first
    end
end
# Id of the post type used for every imported post; resolved on first use and
# cached.
def post_type_id
  @post_type_id || (@post_type_id = createable_post_type_id)
end
# Cached result of get_data(memberships_url).
#
# NOTE(review): neither get_data nor memberships_url is defined in the visible
# part of this file, and nothing visible calls this method - confirm whether
# it is still needed.
def users
  return @users if @users

  @users = get_data(memberships_url)
end
# Post types reported by the instance context (umbrella_post_types); fetched
# on first use and cached.
def content_types
  return @content_types if @content_types

  @content_types = instance_context.umbrella_post_types
end
# Id of the content type to create posts with: the blog-post type when one
# exists, otherwise the first HTML-bodied type.
def createable_post_type_id
  chosen_type = blog_content_type || html_content_type
  chosen_type.id
end
# First content type whose display name contains "blog post", compared
# case-insensitively; nil when there is none.
def blog_content_type
  content_types.find do |content_type|
    name = content_type.display_name.downcase
    name.match(/blog post/)
  end
end
# First content type whose body_type is exactly 'html'; nil when there is
# none.
def html_content_type
  content_types.find do |content_type|
    content_type.body_type == 'html'
  end
end
# Post title for a fetched page: the part of the page title that precedes the
# first ' - ' separator (the whole title when there is no separator).
#
# @param doc [#title] parsed page
def doc_title(doc)
  leading_segment, = doc.title.split(' - ')
  leading_segment
end
# Canonical URL of a fetched page: the href of its first
# <link rel="canonical"> element.
#
# @param doc [#css] parsed page
def doc_url(doc)
  canonical_link = doc.css("link[rel='canonical']").first
  href = canonical_link.attribute('href')
  href.value
end
# Body text for a fetched page: the text of all paragraphs inside
# #mw-content-text.
#
# @param doc [#css] parsed page
def doc_content(doc)
  paragraphs = doc.css('#mw-content-text p')
  paragraphs.text
end
# Thumbnail URL for a fetched page.
#
# Every selector configured for the current wiki is tried in order; the first
# URL that survives processed_thumbnail_urls is returned, or nil when none
# does.
#
# @param doc [#css] parsed page
def doc_thumbnail(doc)
  selectors = wiki_data[:thumbnail_selectors]
  candidates = selectors.map { |selector| thumbnail_url_from_selector(doc, selector) }
  processed_thumbnail_urls(candidates).first
end
# Returns the `src` attribute of the first element matching `selector`.
#
# The selector is evaluated once, and both "nothing matched" and "the element
# has no src attribute" are handled with explicit nil checks.
#
# @param doc [#css] parsed page
# @param selector [String] CSS selector for the thumbnail element
# @return [String, nil] the src value, or nil when no element matches or the
#   first match has no src attribute
def thumbnail_url_from_selector(doc, selector)
  node = doc.css(selector).first
  return nil unless node

  src = node.attribute('src')
  src && src.value
end
# Cleans a list of candidate thumbnail URLs: drops nil entries and any URL
# containing "Eras-" or "Era-", keeping the remaining URLs in their original
# order.
#
# @param thumbnail_urls [Array<String, nil>]
# @return [Array<String>]
def processed_thumbnail_urls(thumbnail_urls)
  excluded_patterns = [/Eras-/, /Era-/]

  thumbnail_urls.compact.select do |url|
    excluded_patterns.none? { |pattern| url.match(pattern) }
  end
end
# Configuration entry for the wiki currently being seeded, i.e. the
# TYPE_MAPPINGS value stored under @page_type (nil when there is no such key).
def wiki_data
  page_type = @page_type
  TYPE_MAPPINGS[page_type]
end
# Builds the attribute hash used to create a post from a fetched page.
#
# Body, canonical URL and title come from the page; the category comes from
# the current wiki's configuration; the post is flagged with the
# :mark_as_published_and_completed operation and stamped with the current
# time.
#
# @param doc [Object] parsed page
# @return [Hash]
def content_data(doc)
  attributes = {}
  attributes[:body] = doc_content(doc)
  attributes[:categories] = Array(wiki_data[:category])
  attributes[:external_post_url] = doc_url(doc)
  attributes[:operation] = :mark_as_published_and_completed
  attributes[:synced_to_gallery_date] = Time.now
  attributes[:title] = doc_title(doc)
  attributes
end
# Imports `count` pages from the wiki selected by @page_type.
#
# For each page: fetch and parse it, create content from the extracted data,
# then attach the page's thumbnail URL and save the content.
def seed_content
  puts "> importing content from #{@page_type}"

  count.times do
    # The URL may come from the environment, so fetch it through URI#open
    # (provided by open-uri) rather than Kernel#open: Kernel#open can run a
    # command when handed a string starting with "|", and it stopped fetching
    # URLs in Ruby 3.0. The block form closes the response once it is parsed.
    doc = URI.parse(wiki_url).open { |response| Nokogiri::HTML(response) }

    content = create_content(content_data(doc))
    content.thumbnail_url = doc_thumbnail(doc)
    content.save
  end
end
end
@ianhenrysmith
Copy link
Author

Hackathon script for importing content into Kapost.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment