Skip to content

Instantly share code, notes, and snippets.

@inilim
Last active November 16, 2025 21:34
Show Gist options
  • Select an option

  • Save inilim/56ee2d6a17e293127b352ccd1b986121 to your computer and use it in GitHub Desktop.

Select an option

Save inilim/56ee2d6a17e293127b352ccd1b986121 to your computer and use it in GitHub Desktop.
4.1# Парсинг LiveLib. Сбор href в базу и сохранение html файлов. (PHP, Crawler, HTTPClient, SQLite)
<?php
// Report every error, warning, notice and deprecation while the crawler runs.
error_reporting(E_ALL);
// 0 removes the execution time limit; the crawl loop below runs until the queue is empty.
set_time_limit(0);
// "Etc/GMT-3" uses the POSIX sign convention, so it denotes UTC+3 (not UTC-3).
date_default_timezone_set('Etc/GMT-3');
// Project helper functions (presumably where timeRun, msleep, getResponse, fpc, am, ... come from — confirm).
require_once __DIR__ . '/functions.php';
# Helper functions for working with PDO SQLite
require_once __DIR__ . '/_INIL_connectLite.php';
// Composer autoloader; expected to provide the Symfony classes imported below.
require_once __DIR__ . '/vendor/autoload.php';
// Raise the PHP memory limit to 5024 MB.
ini_set('memory_limit', '5024M');
// NOTE(review): timeRun() is defined outside this file; presumably it starts a run timer — confirm.
timeRun();
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Component\HttpClient\HttpClient;
// Database file path handed to the SQLite helper class (presumably resolved against the working directory — confirm).
L_INIL_DB::$pathToFileDB = 'BASE_livelib.db';
// A single HTTP client instance is reused for every request in the crawl loop.
$c = HttpClient::create();
// Crawl loop: repeatedly take up to 100 queued rows (status = 0) from `urls`,
// download each book page, store the HTML on disk and queue every /book/ link
// found on it. The loop ends when no row with status = 0 is left.
// Status values written here: 1 = page fetched and stored, 3 = request failed.
while (true) {
    // NOTE(review): the third argument (2) is presumably a "fetch all rows" mode of L_SqlStart — confirm.
    $urls = L_SqlStart('SELECT * FROM urls WHERE status = 0 LIMIT 100', [], 2);
    if (count($urls) === 0) {
        break;
    }

    foreach ($urls as $url) {
        // Randomised pause between requests (msleep is a project helper; presumably milliseconds).
        msleep(mt_rand(2500, 3000));

        $response = $c->request('GET', 'https://www.livelib.ru/book/' . $url['url']);
        // getResponse() yields the page body on success or an array carrying an 'error' key on failure.
        $html = getResponse($response);
        if (is_array($html)) {
            // Request failed: flag the row so it is not picked up again, then report it.
            L_SqlStart('UPDATE urls SET status = 3 WHERE id = :id', [
                'id' => $url['id']
            ]);
            echo $url['id'] . PHP_EOL;
            echo $html['error'] . PHP_EOL;
            continue;
        }

        // Persist the (compressed) page under a folder tree derived from the slug.
        $path = createFolderTree($url['url'], 'livelib_pages', 2);
        fpc($path . sha_($url['url']) . '.html', enCompress($html));

        // Collect the href of every anchor on the page (null when the attribute is missing).
        $crw = new Crawler($html);
        $hrefs = $crw->filter('a')->each(static function (Crawler $node) {
            return $node->attr('href');
        });

        $list = [];
        foreach ($hrefs as $href) {
            // Keep only site-relative book links, i.e. hrefs that start with "/book/".
            if (!is_string($href) || strpos($href, '/book/') !== 0) {
                continue;
            }
            // Drop only the leading prefix; a later "/book/" inside the path must survive.
            $slug = (string) substr($href, strlen('/book/'));
            // Cut the query string and/or fragment so one page maps to one queue entry.
            $slug = (string) preg_replace('~[?#].*$~s', '', $slug);
            // A bare "/book/" link leaves nothing to crawl.
            if ($slug === '') {
                continue;
            }
            $list[] = $slug;
        }
        $list = array_values(array_unique($list));

        // Mark the page as done and queue its links atomically: if an insert
        // fails, the row keeps status = 0 and the page is retried instead of
        // silently losing its outgoing links.
        // NOTE(review): links already present in `urls` are inserted again unless
        // the table enforces uniqueness on [url] — verify the schema.
        L_INIL_DB::begin();
        L_SqlStart('UPDATE urls SET status = 1 WHERE id = :id', [
            'id' => $url['id']
        ]);
        foreach ($list as $slug) {
            L_SqlStart('INSERT INTO urls ([url], [status]) VALUES (:url, 0)',[
                'url' => $slug,
            ]);
        }
        L_INIL_DB::commit();

        echo 'https://www.livelib.ru/book/' . $url['url'] . ' - ' . count($list) . PHP_EOL;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment