Created
November 22, 2025 20:35
-
-
Save maurges/74b6b59c8f2dc4fd3e38cb206a79feb7 to your computer and use it in GitHub Desktop.
Scrape a certain web novel
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| cabal-version: 1.12 | |
| name: scrape | |
| version: 0.1.0.0 | |
| synopsis: Does stuff | |
| author: morj | |
| maintainer: [email protected] | |
| license: GPL3 | |
| build-type: Simple | |
| executable scrape | |
| main-is: Main.hs | |
| other-modules: | |
| Paths_scrape | |
| hs-source-dirs: | |
| ./ | |
| ghc-options: -threaded -rtsopts -with-rtsopts=-N -Wall -Wcompat | |
| build-depends: | |
| base | |
| , process | |
| , scalpel | |
| , text | |
| default-language: Haskell2010 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| {-# LANGUAGE OverloadedStrings #-} | |
| {-# LANGUAGE TypeApplications #-} | |
| {-# LANGUAGE NumericUnderscores #-} | |
| {-# LANGUAGE LambdaCase #-} | |
| module Main where | |
| import qualified Data.Text.IO as TIO | |
| import qualified Text.HTML.Scalpel as S | |
| import qualified System.Process as P | |
| import Control.Applicative ((<|>)) | |
| import Control.Concurrent (threadDelay) | |
| import Data.Function (fix, (&)) | |
| import Data.Text (Text, unpack) | |
| import Text.HTML.Scalpel ((@:), (@=)) | |
| chapterBody :: Monad m => S.ScraperT Text m Text | |
| chapterBody = S.innerHTML ("div" @: ["id" @= "content"]) | |
| nextChapter :: Monad m => S.ScraperT Text m Text | |
| nextChapter = S.attr "href" ("a" @: ["title" @= "Next Chapter"]) | |
| mbNextChapter :: Monad m => S.ScraperT Text m (Maybe Text) | |
| mbNextChapter = fmap Just nextChapter <|> pure Nothing | |
| page :: Monad m => S.ScraperT Text m (Text, Maybe Text) | |
| page = liftA2 (,) chapterBody mbNextChapter | |
| initial :: String | |
| initial = "https://novelfire.net/book/im-an-infinite-regressor-but-ive-got-stories-to-tell/chapter-1" | |
| main :: IO () | |
| main = do | |
| flip fix initial $ \again url -> do | |
| let fileName = "htmls/" <> takeLastWhile (/= '/') url <> ".html" | |
| (_, Just hout, _, handle) <- P.createProcess (P.proc "curl" [url]) | |
| { P.std_out = P.CreatePipe } | |
| pageData <- TIO.hGetContents hout | |
| _ <- P.waitForProcess handle | |
| putStrLn $ "Processing " <> url <> " into " <> fileName | |
| (body, mbNext) <- S.scrapeStringLike pageData page & \case | |
| Nothing -> error "Error parsing page" | |
| Just x -> pure x | |
| TIO.writeFile fileName body | |
| case mbNext of | |
| Nothing -> putStrLn "Reached the final chapter" | |
| Just next -> | |
| threadDelay 2_000_000 | |
| >> again (unpack next) | |
| takeLastWhile :: (a -> Bool) -> [a] -> [a] | |
| takeLastWhile p l = case span p l of | |
| ([], []) -> [] | |
| (r, []) -> r | |
| (_, _:rest) -> takeLastWhile p rest |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment