Skip to content

Instantly share code, notes, and snippets.

@maurges
Created November 22, 2025 20:35
Show Gist options
  • Select an option

  • Save maurges/74b6b59c8f2dc4fd3e38cb206a79feb7 to your computer and use it in GitHub Desktop.

Select an option

Save maurges/74b6b59c8f2dc4fd3e38cb206a79feb7 to your computer and use it in GitHub Desktop.
Scrape a certain web novel
cabal-version: 1.12
name: scrape
version: 0.1.0.0
synopsis: Does stuff
author: morj
maintainer: [email protected]
license: GPL3
build-type: Simple
executable scrape
main-is: Main.hs
other-modules:
Paths_scrape
hs-source-dirs:
./
ghc-options: -threaded -rtsopts -with-rtsopts=-N -Wall -Wcompat
build-depends:
base
, process
, scalpel
, text
default-language: Haskell2010
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE NumericUnderscores #-}
{-# LANGUAGE LambdaCase #-}
module Main where
import qualified Data.Text.IO as TIO
import qualified Text.HTML.Scalpel as S
import qualified System.Process as P
import Control.Applicative ((<|>))
import Control.Concurrent (threadDelay)
import Data.Function (fix, (&))
import Data.Text (Text, unpack)
import Text.HTML.Scalpel ((@:), (@=))
chapterBody :: Monad m => S.ScraperT Text m Text
chapterBody = S.innerHTML ("div" @: ["id" @= "content"])
nextChapter :: Monad m => S.ScraperT Text m Text
nextChapter = S.attr "href" ("a" @: ["title" @= "Next Chapter"])
mbNextChapter :: Monad m => S.ScraperT Text m (Maybe Text)
mbNextChapter = fmap Just nextChapter <|> pure Nothing
page :: Monad m => S.ScraperT Text m (Text, Maybe Text)
page = liftA2 (,) chapterBody mbNextChapter
initial :: String
initial = "https://novelfire.net/book/im-an-infinite-regressor-but-ive-got-stories-to-tell/chapter-1"
main :: IO ()
main = do
flip fix initial $ \again url -> do
let fileName = "htmls/" <> takeLastWhile (/= '/') url <> ".html"
(_, Just hout, _, handle) <- P.createProcess (P.proc "curl" [url])
{ P.std_out = P.CreatePipe }
pageData <- TIO.hGetContents hout
_ <- P.waitForProcess handle
putStrLn $ "Processing " <> url <> " into " <> fileName
(body, mbNext) <- S.scrapeStringLike pageData page & \case
Nothing -> error "Error parsing page"
Just x -> pure x
TIO.writeFile fileName body
case mbNext of
Nothing -> putStrLn "Reached the final chapter"
Just next ->
threadDelay 2_000_000
>> again (unpack next)
takeLastWhile :: (a -> Bool) -> [a] -> [a]
takeLastWhile p l = case span p l of
([], []) -> []
(r, []) -> r
(_, _:rest) -> takeLastWhile p rest
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment