احصل على عنوان موقع الويب عبر الرابط

https://stackoverflow.com/questions/4348912

30-09-2019
|

سؤال

لاحظ كيف تعرض أخبار جوجل المصادر في أسفل كل مقتطف من المقالات.

The Guardian - ABC News - رويترز - بلومبرج

أحاول تقليد ذلك.

على سبيل المثال، عند تقديم عنوان URL http://www.washingtontimes.com/news/2010/dec/3/debt-panel-fails-test-vote/ أريد أن تكون النتيجة المُعادة The Washington Times

كيف هذا ممكن مع PHP؟

الحل

إجابتي تتوسع في إجابة Aai W لاستخدام عنوان الصفحة. فيما يلي الرمز لإنجاز ما قاله.

<?php

/**
 * Download a page and return the text of its first <title> element.
 *
 * @param string $url URL (or any path/stream that file_get_contents() accepts).
 * @return string|null The title with whitespace runs collapsed to single
 *                     spaces and trimmed, or null when the page could not be
 *                     read or contains no <title> element.
 */
function get_title($url){
  $str = file_get_contents($url);
  // file_get_contents() returns false on failure; an empty body has no title either.
  if($str === false || $str === ''){
    return null;
  }

  // Collapse whitespace so a <title> that spans several lines becomes one line.
  $str = preg_replace('/\s+/', ' ', $str);
  if($str === null){
    return null;
  }

  // [^>]* tolerates attributes on the tag; the lazy (.*?) stops at the FIRST
  // closing tag instead of running on to a later </title> (e.g. inside inline SVG).
  if(preg_match('/<title\b[^>]*>(.*?)<\/title>/i', $str, $title) !== 1){
    return null;
  }

  return trim($title[1]);
}
//Example:
echo get_title("http://www.washingtontimes.com/");

?>

الناتج

واشنطن تايمز - السياسة والأخبار العاجلة والأخبار الأمريكية والعالم

كما ترون ، ليس بالضبط ما تستخدمه Google ، لذلك هذا يقودني إلى الاعتقاد بأنهم يحصلون على اسم مضيف عنوان URL ويطابقونه مع قائمته الخاصة.

http://www.washingtontimes.com/ => صحيفة واشنطن تايمز

نصائح أخرى

// Print the <title> text of a remote article page using DOMDocument + XPath.
$doc = new DOMDocument();

// Real-world HTML is rarely well-formed: buffer libxml's parse errors
// instead of letting them surface as PHP warnings, then restore the setting.
$previousSetting = libxml_use_internal_errors(true);
$loaded = $doc->loadHTMLFile('http://www.washingtontimes.com/news/2010/dec/3/debt-panel-fails-test-vote/');
libxml_clear_errors();
libxml_use_internal_errors($previousSetting);

$xpath = new DOMXPath($doc);

// item(0) is null when the document has no <title> (or failed to load),
// so check before dereferencing it.
$titleNode = $loaded ? $xpath->query('//title')->item(0) : null;
if ($titleNode === null) {
    trigger_error('Could not load the page or it has no <title> element', E_USER_WARNING);
} else {
    echo $titleNode->nodeValue."\n";
}

الناتج:

تقصر لجنة الديون على اختبار التصويت - واشنطن تايمز

من الواضح أنه يجب عليك أيضًا تنفيذ معالجة الأخطاء الأساسية.

يمكنك جلب محتويات عنوان URL ثم البحث بتعبير منتظم عن محتوى عنصر title.

<?php
// Download the page; file_get_contents() returns false when the request fails.
$urlContents = file_get_contents("http://example.com/");

// "s" lets "." span line breaks, the lazy (.*?) stops at the first </title>,
// and [^>]* tolerates attributes on the opening tag.
$matches = [];
if ($urlContents !== false
    && preg_match('/<title\b[^>]*>(.*?)<\/title>/is', $urlContents, $matches) === 1) {
    print(trim($matches[1]) . "\n"); // "Example Web Page"
} else {
    trigger_error('Could not download the page or it has no <title> element', E_USER_WARNING);
}
?>

أو، إذا كنت لا ترغب في استخدام تعبير منتظم (لمطابقة شيء قريب من أعلى المستند)، فيمكنك استخدام كائن DOMDocument:

<?php
// Download the page; file_get_contents() returns false when the request fails.
$urlContents = file_get_contents("http://example.com/");

$dom = new DOMDocument();

// Only parse when there is something to parse: on PHP 8 loadHTML() throws a
// ValueError for an empty source, and "@" does not silence exceptions.
if ($urlContents !== false && $urlContents !== '') {
    // Buffer libxml's complaints about sloppy markup instead of emitting warnings.
    $previousSetting = libxml_use_internal_errors(true);
    $dom->loadHTML($urlContents);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);
}

$title = $dom->getElementsByTagName('title');

// item(0) is null when no <title> element exists.
$titleNode = $title->item(0);
if ($titleNode !== null) {
    print($titleNode->nodeValue . "\n"); // "Example Web Page"
} else {
    trigger_error('Could not download the page or it has no <title> element', E_USER_WARNING);
}
?>

أترك الأمر لك لتحديد الطريقة التي تحبها بشكل أفضل.

باستخدام get_meta_tags () من الصفحة الرئيسية للمجال ، فإن NYT يعيد شيئًا قد يحتاج إلى اقتطاع ولكنه قد يكون مفيدًا.

// Reduce the article URL to its site root ("scheme://host") and dump the
// <meta> tags found on that home page.
$b = "http://www.washingtontimes.com/news/2010/dec/3/debt-panel-fails-test-vote/";

// parse_url() splits the URL into its components (scheme, host, path, ...).
$url = parse_url($b);
$homePage = $url['scheme'] . '://' . $url['host'];

$tags = get_meta_tags($homePage);
var_dump($tags);

يتضمن وصف "The Washington Times تقدم الأخبار العاجلة والتعليق على القضايا التي تؤثر على مستقبل أمتنا".

دليل PHP حول cURL

<?php

// Download http://www.example.com/ straight into a local text file.
$fp = fopen("example_homepage.txt", "w");
$ch = curl_init("http://www.example.com/");

curl_setopt_array($ch, [
    CURLOPT_FILE   => $fp, // write the response body to the open file handle
    CURLOPT_HEADER => 0,   // do not include the response headers in the output
]);

// Run the transfer, then release the cURL handle before closing the file.
curl_exec($ch);
curl_close($ch);
fclose($fp);
?>

دليل PHP حول مطابقة التعبيرات المنتظمة المتوافقة مع Perl (PCRE)

<?php
// Search "abcdef" for /^def/ starting at byte offset 3. The offset does not
// cut the subject, so "^" still refers to the real start of the string and
// nothing matches: $matches ends up empty.
$subject = "abcdef";
$pattern = '/^def/';
$startOffset = 3;

preg_match($pattern, $subject, $matches, PREG_OFFSET_CAPTURE, $startOffset);

print_r($matches);
?>

ووضع هذين معًا:

<?php 
// Fetch a page with cURL and print the preg_match() result for its <title>.

// create curl resource
$ch = curl_init();

// set url
curl_setopt($ch, CURLOPT_URL, "example.com");

// return the transfer as a string instead of printing it
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

// $output contains the output string, or false when the transfer failed
$output = curl_exec($ch);

if ($output === false) {
    // read the error message while the handle is still open
    trigger_error('cURL request failed: ' . curl_error($ch), E_USER_WARNING);
} else {
    // [^>]* tolerates attributes on <title>; the closing tag is matched in full
    $pattern = '/<title\b[^>]*>([^<]*)<\/title>/i';

    preg_match($pattern, $output, $matches);

    print_r($matches);
}

// close curl resource to free up system resources
curl_close($ch);
?>

لا أستطيع أن أعد هذا المثال سيعمل لأنه ليس لدي PHP هنا ، ولكن يجب أن يساعدك في البدء.

إذا كنت على استعداد لاستخدام خدمة طرف ثالث لهذا ، فقد قمت ببناء واحدة فقط على www.runway7.net/radar

يمنحك العنوان والوصف وأكثر من ذلك بكثير. على سبيل المثال ، حاول مثالك على الرادار. (http://radar.runway7.net/?url=http://www.washingtontimes.com/news/2010/dec/3/debt-panel-fails-test-vote/)

بدلاً من ذلك يمكنك استخدام المكتبة Simple HTML DOM Parser:

<?php
require_once('simple_html_dom.php');

// Load and parse the article page with Simple HTML DOM's file_get_html().
$html = file_get_html('http://www.washingtontimes.com/news/2010/dec/3/debt-panel-fails-test-vote/');

// Inner HTML of the first <title> element, followed by an HTML line break.
$titleElement = $html->find('title', 0);
echo $titleElement->innertext . "<br>\n";

// Inner HTML of the first <div class="entry-content"> element.
$entryContent = $html->find('div[class=entry-content]', 0);
echo $entryContent->innertext;

كتبت دالة للتعامل مع ذلك:

 /**
  * Fetch a page with cURL and return its <title> converted to UTF-8.
  *
  * The source encoding is looked up in this order:
  *   1. the charset parameter of the HTTP Content-Type header,
  *   2. a <meta http-equiv="content-type" ...> declaration,
  *   3. any other <meta ... charset=...> declaration (e.g. <meta charset="...">).
  *
  * @param string $url Address of the page.
  * @return string The trimmed title in UTF-8 (left unconverted when no charset
  *                is declared or the conversion fails), or $url itself when the
  *                page cannot be fetched or has no <title> element.
  */
 function getURLTitle($url){

    $ch = curl_init();

    curl_setopt($ch, CURLOPT_URL, $url);

    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

    $content = curl_exec($ch);

    $contentType = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);

    curl_close($ch);

    // curl_exec() returns false on failure; an empty body has no title either.
    if(!is_string($content) || $content === ''){
        return $url;
    }

    // [^>]* tolerates attributes, the lazy (.*?) stops at the first </title>,
    // and the "s" flag lets the title span several lines.
    if(!preg_match('/<title\b[^>]*>(.*?)<\/title>/is', $content, $matches)){
        return $url;
    }
    $title = trim($matches[1]);

    // Captures only the charset token itself, with or without surrounding
    // quotes, so trailing attributes or parameters are never swallowed.
    $charsetPattern = '/\bcharset\s*=\s*["\']?\s*([a-z0-9._:\-]+)/i';

    $charset = '';

    // 1. HTTP header
    if(is_string($contentType) && preg_match($charsetPattern, $contentType, $ms)){
        $charset = $ms[1];
    }

    if($charset === '' && preg_match_all('/<meta\b[^>]*>/i', $content, $metaMatches)){
        // Index 0 holds the full matches, i.e. the list of <meta ...> tags.
        $metaTags = $metaMatches[0];

        // 2. meta http-equiv content-type
        foreach($metaTags as $tag){
            if(stripos($tag, 'content-type') !== false && preg_match($charsetPattern, $tag, $ms)){
                $charset = $ms[1];
                break;
            }
        }

        // 3. meta charset=utf-8 / meta charset='utf-8'
        if($charset === ''){
            foreach($metaTags as $tag){
                if(preg_match($charsetPattern, $tag, $ms)){
                    $charset = $ms[1];
                    break;
                }
            }
        }
    }

    // Nothing to convert when the charset is unknown or already UTF-8.
    if($charset === '' || strcasecmp($charset, 'utf-8') === 0){
        return $title;
    }

    // The charset label comes from the remote page and may be bogus; iconv()
    // then emits a notice and returns false, so fall back to the raw title.
    $converted = @iconv($charset, 'utf-8', $title);

    return $converted === false ? $title : $converted;
}

تجلب الدالة محتوى صفحة الويب، وتحاول تحديد ترميز المحارف (charset) للمستند (من الأولوية الأعلى إلى الأدنى):

معامل "charset" في حقل ترويسة HTTP المسمى "Content-Type".
إعلان META تكون فيه "http-equiv" مضبوطة على "Content-Type" مع قيمة محددة لـ "charset".
سمة charset المضبوطة على عنصر يشير إلى مورد خارجي.

(انظر http://www.w3.org/TR/html4/charset.html)

ثم تستخدم iconv لتحويل العنوان إلى ترميز utf-8.

احصل على عنوان موقع الويب عبر الرابط وتحويل العنوان إلى ترميز أحرف UTF-8 ：

https://gist.github.com/kisexu/b64bc6ab787f302ae838

/**
 * Fetch a page with cURL (following redirects, with a browser user agent)
 * and return its <title> converted to UTF-8.
 *
 * @param string $url Address of the page.
 * @return string The trimmed title, or 'Untitled' when the page cannot be
 *                fetched or has no non-empty <title>. The title is returned
 *                unconverted when no charset is declared in a <meta> tag or
 *                when the conversion fails.
 */
function getTitle($url)
{
    // get html via url
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    $html = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on failure; an empty body has no title either.
    if (!is_string($html) || $html === '') {
        return 'Untitled';
    }

    // get title: [^>]* tolerates attributes on the tag, the lazy (.*?) stops at
    // the first </title>, and the "s" flag lets the title span several lines
    if (!preg_match('/<title\b[^>]*>(.*?)<\/title>/is', $html, $match)) {
        return 'Untitled';
    }
    $title = trim($match[1]);
    if ($title === '') {
        return 'Untitled';
    }

    // convert title to utf-8 character encoding; the charset is read from the
    // first <meta> tag that declares one, with single, double or no quotes
    if (preg_match('/<meta\b[^>]*\bcharset\s*=\s*["\']?\s*([a-z0-9._:\-]+)/i', $html, $match)) {
        $charset = strtolower($match[1]);
        if ($charset != 'utf-8') {
            // The label comes from the remote page and may be bogus; iconv()
            // then emits a notice and returns false, so keep the raw title.
            $converted = @iconv($charset, 'utf-8', $title);
            if ($converted !== false) {
                $title = $converted;
            }
        }
    }

    return $title;
}

أحاول تجنب التعبيرات المنتظمة عندما لا تكون ضرورية، لذا كتبت دالة للحصول على عنوان موقع الويب باستخدام cURL و DOMDocument أدناه.

/**
 * Fetch a page with cURL and return the text of its first <title> element,
 * extracted with DOMDocument instead of a regular expression.
 *
 * @param string $url Address of the page.
 * @return string|null The title text, or null when the page cannot be
 *                     fetched, is empty, or has no <title> element.
 */
function website_title($url) {
   $ch = curl_init();
   curl_setopt($ch, CURLOPT_URL, $url);
   curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
   // some websites like Facebook need a user agent to be set.
   curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36');
   $html = curl_exec($ch);
   curl_close($ch);

   // curl_exec() returns false on failure, and on PHP 8 loadHTML() throws a
   // ValueError for an empty source, so stop before parsing.
   if (!is_string($html) || $html === '') {
      return null;
   }

   $dom  = new DOMDocument;
   // Buffer libxml's complaints about sloppy markup instead of emitting warnings.
   $previousSetting = libxml_use_internal_errors(true);
   $dom->loadHTML($html);
   libxml_clear_errors();
   libxml_use_internal_errors($previousSetting);

   // item(0) is null when the document has no <title> element.
   $titleNode = $dom->getElementsByTagName('title')->item(0);
   if ($titleNode === null) {
      return null;
   }

   return $titleNode->nodeValue;
}

echo website_title('https://www.facebook.com/');

يعود أعلاه ما يلي: مرحبًا بك في Facebook - قم بتسجيل الدخول أو التسجيل أو معرفة المزيد

مرخصة بموجب: CC-BY-SA مع الإسناد

غير تابع لـ StackOverflow