
@JonathanPort
Created February 11, 2023 20:21
URL list downloader
<?php

namespace App\Console\Commands;

use Illuminate\Support\Str;
use Illuminate\Support\Facades\Storage;
use Illuminate\Support\Facades\Http;
use Illuminate\Http\Client\Pool;
use Illuminate\Http\Client\Response;
use Illuminate\Console\Command;
use App\Console\Commands\Crawler\OperatorUrl;
use App\Console\Commands\Crawler\Operator;

final class DownloadUrls extends Command
{
    protected $signature = 'download-urls';

    protected $description = 'Download all operator URLs in concurrent batches and store the HTML responses';

    public function __construct()
    {
        parent::__construct();
    }
    public function handle()
    {
        // Create a job ID so we can reference this run later
        $jobId = Str::random(8);

        // Create a directory named after the job ID so all downloaded files are contained
        Storage::makeDirectory($jobId);

        // Output to terminal
        $this->info('Starting job: ' . $jobId);

        // Get all operator URLs from the DB (~70k records)
        $records = OperatorUrl::get();

        // Create a progress bar for the terminal
        $progress = $this->output->createProgressBar(count($records));

        // Number of requests per batch
        $concurrent = 20;

        // Start the progress bar
        $progress->start();

        // Chunk the records into batches of $concurrent and run each batch
        // through a request pool. Chunking also covers the final partial
        // batch, which a manual counter would otherwise drop.
        foreach ($records->chunk($concurrent) as $batch) {

            // Put the batch URLs into a plain array so responses can be
            // mapped back to their URLs by index
            $batchUrls = $batch->pluck('url')->values()->all();

            // Pool the requests so they are sent concurrently
            $responses = Http::pool(function (Pool $pool) use ($batchUrls) {
                $arr = [];
                foreach ($batchUrls as $url) $arr[] = $pool->get($url);
                return $arr;
            });

            // Loop over the responses, map each one back to its URL and run
            // handleResponse, which stores the body in an HTML file and creates
            // a record with all the info. If the request failed, the record is
            // still created but flagged as failed, so we can accurately account
            // for which URLs failed if needed.
            foreach ($responses as $i => $res) {
                $this->handleResponse($res, $batchUrls[$i], $jobId);
            }

            $progress->advance(count($batch));
        }

        $progress->finish();

        $this->newLine();
        $this->info('Finished job: ' . $jobId);

        return Command::SUCCESS;
    }
    private function handleResponse($res, $url, $jobId)
    {
        // Pooled requests that fail at the connection level come back as an
        // exception instance rather than a response, so guard against that too
        if ($res instanceof Response && $res->successful()) {

            $fileId = Str::random(8);

            Storage::put($jobId . '/' . $fileId . '.html', $res->body());

            Operator::create([
                'job_id' => $jobId,
                'original_url' => $url,
                'filename' => $fileId . '.html',
                'scrape_status' => 'downloaded',
            ]);

        } else {

            Operator::create([
                'job_id' => $jobId,
                'original_url' => $url,
                'scrape_status' => 'download_failed',
            ]);

            $this->newLine();
            $this->error('Failed: ' . $url);
            $this->newLine();
        }
    }
}
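
The command assumes two Eloquent models, OperatorUrl (the source list of URLs to fetch) and Operator (one record per attempted download), living in the App\Console\Commands\Crawler namespace per the imports above. Neither model is included in the gist, so the following is a minimal sketch assuming standard Eloquent conventions: the table names and the mass-assignable columns are assumptions taken from what the command reads and writes, and each class would normally live in its own file.

<?php

namespace App\Console\Commands\Crawler;

use Illuminate\Database\Eloquent\Model;

// Source table of URLs to crawl; the command only reads the `url` attribute.
class OperatorUrl extends Model
{
    protected $table = 'operator_urls'; // assumed table name
}

// One row per attempted download, written by DownloadUrls::handleResponse().
class Operator extends Model
{
    protected $table = 'operators'; // assumed table name

    // Columns the command mass-assigns via Operator::create()
    protected $fillable = [
        'job_id',
        'original_url',
        'filename',
        'scrape_status',
    ];
}

With the models and their tables in place, the command is run with `php artisan download-urls`; using the default local disk, the HTML files land under storage/app/{jobId}.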