121
app/Services/Data/News/ImdbNewsProvider.php
Executable file
121
app/Services/Data/News/ImdbNewsProvider.php
Executable file
@@ -0,0 +1,121 @@
|
||||
<?php
|
||||
|
||||
namespace App\Services\Data\News;
|
||||
|
||||
use Illuminate\Support\Facades\Http;
|
||||
use Symfony\Component\DomCrawler\Crawler;
|
||||
|
||||
class ImdbNewsProvider
|
||||
{
|
||||
public function getArticles(): array
|
||||
{
|
||||
$compiledNews = [];
|
||||
|
||||
$html = Http::withHeaders([
|
||||
'User-Agent' =>
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ' .
|
||||
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 ' .
|
||||
'Safari/537.36',
|
||||
])
|
||||
->get('https://www.imdb.com/news/top')
|
||||
->getBody()
|
||||
->getContents();
|
||||
$strippedHtml = preg_replace(
|
||||
'/<script(.*?)>(.*?)<\/script>/is',
|
||||
'',
|
||||
$html,
|
||||
);
|
||||
|
||||
$crawler = new Crawler($strippedHtml);
|
||||
|
||||
// grab every news article on the page
|
||||
foreach (
|
||||
$crawler->filter(
|
||||
'[data-testid="sub-section-news-card-section"] .ipc-list-card',
|
||||
)
|
||||
as $k => $node
|
||||
) {
|
||||
$articleCrawler = new Crawler($node);
|
||||
|
||||
// extract related people and title ids from article
|
||||
$links = $articleCrawler->filter('a')->extract(['href']);
|
||||
$imdbTitleIds = [];
|
||||
$imdbPersonIds = [];
|
||||
foreach ($links as $href) {
|
||||
preg_match('/\/title\/(tt[0-9]+)\//', $href, $titleMatches);
|
||||
preg_match('/\/name\/(nm[0-9]+)\//', $href, $nameMatches);
|
||||
if (isset($titleMatches[1])) {
|
||||
$imdbTitleIds[] = $titleMatches[1];
|
||||
}
|
||||
if (isset($nameMatches[1])) {
|
||||
$imdbPersonIds[] = $nameMatches[1];
|
||||
}
|
||||
}
|
||||
|
||||
$date = head(
|
||||
$articleCrawler
|
||||
->filter('.ipc-inline-list li')
|
||||
->extract(['_text']),
|
||||
);
|
||||
$byline = head(
|
||||
$articleCrawler
|
||||
->filter('.ipc-inline-list li')
|
||||
->eq(1)
|
||||
->extract(['_text']),
|
||||
);
|
||||
|
||||
$sourceUrl = last(
|
||||
$articleCrawler->filter('.ipc-link')->extract(['href']),
|
||||
);
|
||||
if (!isset(parse_url($sourceUrl)['scheme'])) {
|
||||
$sourceUrl = "https://imdb.com{$sourceUrl}";
|
||||
}
|
||||
$img = head($articleCrawler->filter('img')->extract(['src']));
|
||||
$body = trim(
|
||||
$articleCrawler->filter('.ipc-html-content-inner-div')->html(),
|
||||
);
|
||||
|
||||
$body = preg_replace(
|
||||
'/<div data-reactroot="">.+?<\/div>/',
|
||||
'',
|
||||
$body,
|
||||
);
|
||||
$body = strip_tags($body, '<br>');
|
||||
|
||||
if (!$img) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// transform each news article into array
|
||||
$compiledNews[$k] = [
|
||||
'title' => trim(
|
||||
head(
|
||||
$articleCrawler
|
||||
->filter('.ipc-link')
|
||||
->extract(['_text']),
|
||||
),
|
||||
),
|
||||
'body' => $body,
|
||||
'imdb_title_ids' => $imdbTitleIds,
|
||||
'imdb_person_ids' => $imdbPersonIds,
|
||||
'date' => trim($date),
|
||||
'source' => trim(
|
||||
last(
|
||||
$articleCrawler
|
||||
->filter('.ipc-link')
|
||||
->extract(['_text']),
|
||||
),
|
||||
),
|
||||
'source_url' => $sourceUrl,
|
||||
'byline' => str_replace('by ', '', trim($byline)),
|
||||
'image' => preg_replace(
|
||||
'/([A-Z]+)([0-9]+)_CR([0-9]+),([0-9]+),100,150/',
|
||||
'${1}400_CR$3,$4,270,400',
|
||||
$img,
|
||||
),
|
||||
];
|
||||
}
|
||||
|
||||
return $compiledNews;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user