Created
February 11, 2023 20:21
-
-
Save JonathanPort/cfb2da21033d04937112b2f420bcc217 to your computer and use it in GitHub Desktop.
Url list downloaded
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?php | |
| namespace App\Console\Commands; | |
| use Illuminate\Support\Str; | |
| use Illuminate\Support\Facades\Storage; | |
| use Illuminate\Support\Facades\Http; | |
| use Illuminate\Http\Client\Pool; | |
| use Illuminate\Console\Command; | |
| use App\Console\Commands\Crawler\OperatorUrl; | |
| use App\Console\Commands\Crawler\Operator; | |
/**
 * Artisan command that downloads the HTML of every stored operator URL.
 *
 * URLs are fetched in concurrent batches through the HTTP client pool. Each
 * successful body is written to a per-job storage directory, and one Operator
 * record is created per URL with either a "downloaded" or a "download_failed"
 * status, so failed URLs can be accounted for afterwards.
 */
final class DownloadUrls extends Command
{
    /**
     * Maximum number of URLs requested concurrently in a single pool.
     */
    private const CONCURRENT_REQUESTS = 20;

    protected $signature = 'download-urls';

    protected $description = 'Command description';

    public function __construct()
    {
        parent::__construct();
    }

    /**
     * Run the download job over all operator URLs.
     *
     * @return int Command::SUCCESS once every URL has been attempted.
     */
    public function handle()
    {
        // Random job ID: names the storage directory and tags every record
        // created during this run so the run can be referenced later.
        $jobId = Str::random(8);

        Storage::makeDirectory($jobId);

        $this->info('Starting job: ' . $jobId);

        // NOTE(review): this loads every operator URL record into memory at
        // once (roughly 70k records according to the original author).
        // Consider a chunked/lazy query if memory becomes a problem.
        $records = OperatorUrl::get();

        $progress = $this->output->createProgressBar(count($records));
        $progress->start();

        $batchUrls = [];

        foreach ($records as $record) {
            $batchUrls[] = $record->url;

            // Dispatch as soon as the batch reaches the concurrency limit.
            if (count($batchUrls) === self::CONCURRENT_REQUESTS) {
                $this->downloadBatch($batchUrls, $jobId);

                // Advance only after the batch has actually been processed.
                $progress->advance(count($batchUrls));

                $batchUrls = [];
            }
        }

        // Flush the final, partially filled batch. Without this the last
        // (count % CONCURRENT_REQUESTS) URLs would never be downloaded.
        if ($batchUrls !== []) {
            $this->downloadBatch($batchUrls, $jobId);
            $progress->advance(count($batchUrls));
        }

        $progress->finish();

        $this->info('Finished job: ' . $jobId);

        return Command::SUCCESS;
    }

    /**
     * Request one batch of URLs concurrently and record each result.
     *
     * @param array  $urls  List of URLs (0-indexed) to request in one pool.
     * @param string $jobId Identifier of the current job.
     *
     * @return void
     */
    private function downloadBatch(array $urls, $jobId)
    {
        $responses = Http::pool(function (Pool $pool) use ($urls) {
            $requests = [];

            foreach ($urls as $url) {
                $requests[] = $pool->get($url);
            }

            return $requests;
        });

        // Requests were added to the pool in the same order as $urls, so the
        // result at a given index belongs to the URL at that index. A missing
        // entry is passed on as null and recorded as a failure.
        foreach ($urls as $index => $url) {
            $this->handleResponse($responses[$index] ?? null, $url, $jobId);
        }
    }

    /**
     * Persist the outcome of a single request.
     *
     * On success the body is stored as "<jobId>/<random>.html" and an Operator
     * record with status "downloaded" is created. In every other case (non-2xx
     * response, a pool entry that is not a response object such as a
     * connection exception, or a storage write that reports failure) an
     * Operator record with status "download_failed" is created and the URL is
     * reported on the console.
     *
     * @param mixed  $res   Pool result for the URL: a client response, or
     *                      anything else (exception/null) when the request
     *                      could not be completed.
     * @param string $url   URL that was requested.
     * @param string $jobId Identifier of the current job.
     *
     * @return void
     */
    private function handleResponse($res, $url, $jobId)
    {
        $filename = null;

        // Only a real response object can be asked whether it succeeded; the
        // pool may hand back an exception instead of a response.
        if ($res instanceof \Illuminate\Http\Client\Response && $res->successful()) {
            $candidate = Str::random(8) . '.html';

            // Only keep the filename if the write actually succeeded, so the
            // record never points at a file that does not exist.
            if (Storage::put($jobId . '/' . $candidate, $res->body())) {
                $filename = $candidate;
            }
        }

        if ($filename !== null) {
            Operator::create([
                'job_id' => $jobId,
                'original_url' => $url,
                'filename' => $filename,
                'scrape_status' => 'downloaded',
            ]);

            return;
        }

        Operator::create([
            'job_id' => $jobId,
            'original_url' => $url,
            'scrape_status' => 'download_failed',
        ]);

        // Blank lines keep the error message visually separate from the
        // progress bar output.
        $this->info(' ', true);
        $this->error('Failed: ' . $url);
        $this->info(' ', true);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment