Skip to content

Instantly share code, notes, and snippets.

@yuanw
Created March 5, 2017 23:32
Show Gist options
  • Select an option

  • Save yuanw/1751da11b2a315aeb53d7eccc8f406e2 to your computer and use it in GitHub Desktop.

Select an option

Save yuanw/1751da11b2a315aeb53d7eccc8f406e2 to your computer and use it in GitHub Desktop.
HTML parsing example
name: learn-html-conduit
version: 0.1.0.0
-- synopsis:
-- description:
homepage: https://github.com/githubuser/learn-html-conduit#readme
license: BSD3
license-file: LICENSE
author: Author name here
maintainer: example@example.com
copyright: 2017 Author name here
category: Web
build-type: Simple
extra-source-files: README.md
cabal-version: >=1.10
library
  hs-source-dirs: src
  exposed-modules: Lib
  build-depends: base >= 4.7 && < 5
               , bytestring
               , http-conduit
               , html-conduit
               , text
               , xml-conduit
  default-language: Haskell2010
executable learn-html-conduit-exe
  hs-source-dirs: app
  main-is: Main.hs
  ghc-options: -threaded -rtsopts -with-rtsopts=-N
  build-depends: base
               , bytestring
               , http-conduit
               , html-conduit
               , learn-html-conduit
               , text
               , xml-conduit
  default-language: Haskell2010
test-suite learn-html-conduit-test
  type: exitcode-stdio-1.0
  hs-source-dirs: test
  main-is: Spec.hs
  build-depends: base
               , learn-html-conduit
  ghc-options: -threaded -rtsopts -with-rtsopts=-N
  default-language: Haskell2010
source-repository head
  type: git
  location: https://github.com/githubuser/learn-html-conduit
{-# LANGUAGE OverloadedStrings #-}
module Main where
import Network.HTTP.Conduit (simpleHttp)
import qualified Data.Text as T
import Text.HTML.DOM (parseLBS)
import Text.XML.Cursor (Axis, Cursor, attributeIs, content, element, fromDocument, child,
($//), (&|), (&//), (>=>))
-- | The URL we're going to search: a Bing results page for the query
-- @school of haskell@.
url :: String
url = "http://www.bing.com/search?q=school+of+haskell"
-- | Axis describing the nodes we are after: keep the cursor only when it
-- points at a @span@ element whose @class@ attribute is @sb_count@, then
-- move to that element's children.
findNodes :: Axis
findNodes cursor =
  element "span" cursor
    >>= attributeIs "class" "sb_count"
    >>= child
-- | Collapse the text pieces that 'content' yields for a cursor into a
-- single 'T.Text' value.
extractData :: Cursor -> T.Text
extractData cursor = T.concat (content cursor)
-- | Join every extracted piece of text (with no separator) and print the
-- result to standard output as one line.
processData :: [T.Text] -> IO ()
processData pieces = putStrLn (T.unpack (T.concat pieces))
-- | Download the page at the given URL with 'simpleHttp', parse the
-- response body as HTML with 'parseLBS', and return a cursor positioned
-- on the resulting document.
cursorFor :: String -> IO Cursor
cursorFor u = fmap (fromDocument . parseLBS) (simpleHttp u)
-- | Entry point: fetch the page at 'url', apply 'findNodes' to every
-- descendant of the document root, extract the text of each match with
-- 'extractData', and print the combined result via 'processData'.
main :: IO ()
main = do
  root <- cursorFor url
  let matches = root $// findNodes &| extractData
  processData matches
# This file was automatically generated by 'stack init'
#
# Some commonly used options have been documented as comments in this file.
# For advanced use and comprehensive documentation of the format, please see:
# http://docs.haskellstack.org/en/stable/yaml_configuration/
# Resolver to choose a 'specific' stackage snapshot or a compiler version.
# A snapshot resolver dictates the compiler version and the set of packages
# to be used for project dependencies. For example:
#
# resolver: lts-3.5
# resolver: nightly-2015-09-21
# resolver: ghc-7.10.2
# resolver: ghcjs-0.1.0_ghc-7.10.2
# resolver:
#   name: custom-snapshot
#   location: "./custom-snapshot.yaml"
resolver: lts-8.3
# User packages to be built.
# Various formats can be used as shown in the example below.
#
# packages:
# - some-directory
# - https://example.com/foo/bar/baz-0.0.2.tar.gz
# - location:
#     git: https://github.com/commercialhaskell/stack.git
#     commit: e7b331f14bcffb8367cd58fbfc8b40ec7642100a
# - location: https://github.com/commercialhaskell/stack/commit/e7b331f14bcffb8367cd58fbfc8b40ec7642100a
#   extra-dep: true
#   subdirs:
#   - auto-update
#   - wai
#
# A package marked 'extra-dep: true' will only be built if demanded by a
# non-dependency (i.e. a user package), and its test suites and benchmarks
# will not be run. This is useful for tweaking upstream packages.
packages:
- '.'
# Dependency packages to be pulled from upstream that are not in the resolver
# (e.g., acme-missiles-0.3)
extra-deps:
- http-conduit-2.2.3.1
- html-conduit-1.2.1.1
# Override default flag values for local packages and extra-deps
flags: {}
# Extra package databases containing global packages
extra-package-dbs: []
# Control whether we use the GHC we find on the path
# system-ghc: true
#
# Require a specific version of stack, using version ranges
# require-stack-version: -any # Default
# require-stack-version: ">=1.1"
#
# Override the architecture used by stack, especially useful on Windows
# arch: i386
# arch: x86_64
#
# Extra directories used by stack for building
# extra-include-dirs: [/path/to/dir]
# extra-lib-dirs: [/path/to/dir]
#
# Allow a newer minor version of GHC than the snapshot specifies
# compiler-check: newer-minor
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment