Files
archived-web-master/scripts/rss_parser

238 lines
8.7 KiB
Plaintext

<?php
// This file contains code to scrape the PHP.net homepage and
// generate RSS news feed information out of that (included
// in the update-backend PHP-CLI script)
// Thanks to Adolfo Garcia Veytia for contributing the
// original version of this code to php.net
// Return the whole file in a string
/**
 * Return the whole contents of a file as a string.
 *
 * @param string $fname Path of the file to read.
 * @return string|false The file contents ("" for an empty file), or false
 *                      when the path is a directory, is not readable, or
 *                      the read itself fails.
 */
function getData($fname) {
    // is_readable() alone also accepts directories, which cannot be read
    // as a file, so they are rejected explicitly.
    if (is_dir($fname) || !is_readable($fname)) {
        return false;
    }

    // file_get_contents() reads up to EOF; unlike fread($fp, filesize(...))
    // it does not fail on a zero-length file and needs no handle cleanup.
    return file_get_contents($fname);
}
// Try to find a link in the text using any of the supported linking methods
/**
 * Find the first link in a piece of news markup and return it as a full URL.
 *
 * The recognised link forms are tried in a fixed order and the first form
 * that matches anywhere in the text wins:
 *   1. <a name="..."> anchors (returned as "#name")
 *   2. print_link("url", "text") calls
 *   3. print_link('url', 'text') calls
 *   4. print_link("url", make_image(...)) calls
 *   5. <a href="..."> links
 *
 * @param string $text     Markup to search.
 * @param string $rootLink Base URL prepended to links that carry no scheme.
 * @return string The link; links without a URL scheme are prefixed with
 *                $rootLink. When no link is found the result is $rootLink
 *                with exactly one trailing slash.
 */
function scanLinks($text, $rootLink) {
    // Pattern => prefix put in front of the captured target
    $patterns = array(
        '/<a\s+name="([^"]*)">/' => '#',
        '/<\?php\s+print_link\s*\("([^"]+)",\s*"[^"]+"\);\s*\?>/' => '',
        '/<\?php\s+print_link\s*\(\'([^\']+)\',\s*\'[^\']+\'\);\s*\?>/' => '',
        '/<\?php\s+print_link\s*\("([^"]+)",\s*make_image\s*\([^\)]*\)\s*\);\s*\?>/' => '',
        '/<a\s+href="([^"]*)">/' => '',
    );

    // Start from an empty link so the code below never reads an
    // undefined variable when nothing matches.
    $link = '';
    foreach ($patterns as $pattern => $prefix) {
        if (preg_match($pattern, (string) $text, $matches)) {
            $link = $prefix . $matches[1];
            break;
        }
    }

    // Make sure it is a full URL: anything that does not start with a URL
    // scheme (http:, https:, ftp:, mailto:, ...) is resolved against the root.
    if (!preg_match('/^[a-z][a-z0-9+.\-]*:/i', $link)) {
        $rootLink = rtrim($rootLink, "/");
        // strncmp() is safe on an empty string, unlike a direct offset read
        $link = (strncmp($link, '/', 1) !== 0) ? "$rootLink/$link" : "$rootLink$link";
    }

    return $link;
}
// Preserve the parts of the text that are needed and drop everything unusable
/**
 * Reduce a news item's markup to plain, single-spaced text.
 *
 * Embedded make_image() and print_link() PHP snippets are rewritten first,
 * then all remaining tags are stripped and whitespace is normalised.
 *
 * @param string $text Raw markup of the news item.
 * @return string The cleaned-up text.
 */
function ProcessText($text) {
    // Each rule is array(pattern, replacement); rules run in this order.
    $rewrites = array(
        // Images are replaced by a single space, since this data will go through XML
        array('/<\?php\s+echo\s+make_image\s*\("([^"]*)",\s*"([^"]*)",\s+"([^"]*)"\);\s*\?>/i', ' '),
        // Text links keep only their link text (double-quoted arguments)
        array('/<\?php\s+print_link\s*\("([^"]+)",\s*"([^"]+)"\);\s*\?>/', '$2'),
        // Text links keep only their link text (single-quoted arguments)
        array('/<\?php\s+print_link\s*\(\'([^\']+)\',\s*\'([^\']+)\'\);\s*\?>/', '$2'),
        // Image links are removed completely
        array('/<\?php\s+print_link\s*\("([^"]+)",\s*make_image\s*\([^\)]*\)\s*\);\s*\?>/', ''),
    );

    foreach ($rewrites as $rule) {
        list($pattern, $replacement) = $rule;
        $text = preg_replace($pattern, $replacement, $text);
    }

    // Drop the remaining HTML, trim, then collapse every whitespace run
    $plain = trim(strip_tags($text));

    return preg_replace('!\s+!', ' ', $plain);
}
// Parse the index file searching for news item information
/**
 * Parse the source of the index page and collect the news items found in it.
 *
 * Parsing is line based. Everything up to and including the line that
 * contains "DO NOT REMOVE THIS COMMENT" is skipped. After that every
 * "<hr />" line starts the next item, and parsing stops at the
 * "News Archive" link or at a line starting with "// NO MORE NEWS TO PARSE".
 *
 * @param string $index_page Source code of the index page.
 * @param string $aboutLink  Root URL handed to scanLinks() to complete
 *                           links that are not full URLs.
 * @return array News items keyed by headline number. Every item has the keys
 *               'title', 'text' (cleaned by ProcessText()), 'link' and
 *               'date' ("Y-m-d", or "" when no valid date line was found),
 *               plus 'subject' when a SUBJECT comment was present.
 */
function ParseNews ($index_page = "", $aboutLink = "") {
    // Split the file by newlines
    $lines = preg_split("/\n/", (string) $index_page);

    // Month name => month number
    $mos = array(
        "Jan" => 1, "Feb" => 2, "Mar" => 3, "Apr" => 4, "May" => 5, "Jun" => 6,
        "Jul" => 7, "Aug" => 8, "Sep" => 9, "Oct" => 10, "Nov" => 11, "Dec" => 12
    );

    // We have not started to parse the news and have no headlines yet
    $headlines = array();
    $news_started = false;
    $headlineid = 0;

    // Classify every line using the state information and the
    // patterns that identify the elements of a news item
    foreach ($lines as $line) {

        // Not in the news section yet: wait for the marker comment.
        // The marker line itself is never treated as news content.
        if (!$news_started) {
            // Compare against false so a marker at offset 0 is found too
            if (strpos($line, "DO NOT REMOVE THIS COMMENT") !== false) {
                $news_started = true;
            }
            continue;
        }

        // Headline separator reached
        if (preg_match('!<hr />!', $line)) {
            $headlineid++;

        // End of headlines reached
        } elseif (preg_match('@<a href="/archive/index.php">News Archive</a>@', $line) || strpos($line, "// NO MORE NEWS TO PARSE") === 0) {
            // Drop the last collected entry; presumably it only holds the
            // markup between the final separator and this end marker.
            array_pop($headlines);
            break;

        // The headline title is in <h1> tags [it needs to be on one line!]
        } elseif (preg_match('/<h1>(.*)<\/h1>/i', $line, $matches)) {
            $headlines[$headlineid]['title'] = "$matches[1]";

        // Dates ([day-Mon-year] inside a <span>) are below the headline title
        } elseif (preg_match('/<span.*>\[(\d+)-(\S*)-(\d+)\]<\/span>/', $line, $matches)) {
            // An unknown month name leaves the item without a date
            if (isset($mos[$matches[2]])) {
                $headlines[$headlineid]['date'] = mktime(1, 1, 1, $mos[$matches[2]], (int) $matches[1], (int) $matches[3]);
            }

        // Subjects (i.e. RDF category)
        } elseif (preg_match("/<!-- SUBJECT: (.*?) -->/", $line, $matches)) {
            $headlines[$headlineid]['subject'] = $matches[1];

        // Everything else, except a lone PHP closing tag, is headline text
        } elseif (!preg_match('/^\s*\?>\s*$/', $line)) {
            if (isset($headlines[$headlineid]['text'])) {
                $headlines[$headlineid]['text'] .= " $line";
            } else {
                $headlines[$headlineid]['text'] = " $line";
            }
        }
    }

    // Post-process the headlines; every item gets all four standard keys
    foreach ($headlines as $num => $headline) {
        $text = isset($headline['text']) ? $headline['text'] : '';

        // The first link found is THE link for the news item
        $headlines[$num]['link'] = scanLinks($text, $aboutLink);

        // And the text needs to be cleaned up
        $headlines[$num]['text'] = ProcessText($text);

        // And the date needs to be reformatted
        $headlines[$num]['date'] = isset($headline['date']) ? date("Y-m-d", $headline['date']) : '';

        if (!isset($headline['title'])) {
            $headlines[$num]['title'] = '';
        }
    }

    return $headlines;
}
// Generate RSS header text and inject it into $RSS
/**
 * Append the RSS 1.0 (RDF) prolog and the <channel> element to $RSS.
 *
 * The channel lists every headline's link as an rdf:li resource. All URLs
 * are entity-decoded and then XML-escaped, so both "&" and "&amp;" in the
 * input end up as a single, well-formed "&amp;" in the output.
 *
 * @param array|null $headlines  News items; the 'link' entry of each is used.
 *                               Items without a 'link' entry are skipped.
 * @param string     $RSS        Output buffer, appended to by reference.
 * @param string     $aboutLink  URL of the channel.
 * @return void
 */
function GenerateRSSHeader($headlines, &$RSS, $aboutLink) {
    $xmlFlags = ENT_QUOTES | ENT_XML1 | ENT_SUBSTITUTE;

    $about = htmlspecialchars(
        html_entity_decode((string) $aboutLink, ENT_QUOTES | ENT_HTML5, 'UTF-8'),
        $xmlFlags,
        'UTF-8'
    );

    $RSS .= "<" . "?xml version=\"1.0\" encoding=\"utf-8\"?>\n" .
        "<rdf:RDF\n" .
        "\txmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n" .
        "\txmlns=\"http://purl.org/rss/1.0/\"\n" .
        "\txmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n" .
        ">\n" .
        "<channel rdf:about=\"$about\">\n" .
        "\t<title>PHP: Hypertext Preprocessor</title>\n" .
        "\t<link>$about</link>\n" .
        "\t<description>The PHP scripting language web site</description>\n" .
        "\t<items>\n" .
        "\t\t<rdf:Seq>\n";

    // Cycle through all the resources of the feed
    foreach ((array) $headlines as $headline) {
        if (!isset($headline['link'])) {
            continue;
        }
        $resource = htmlspecialchars(
            html_entity_decode((string) $headline['link'], ENT_QUOTES | ENT_HTML5, 'UTF-8'),
            $xmlFlags,
            'UTF-8'
        );
        $RSS .= "\t\t\t<rdf:li rdf:resource=\"" . $resource . "\" />\n";
    }

    $RSS .= "\t\t</rdf:Seq>\n\t</items>\n</channel>\n";
}
// Add RSS footer information to $RSS
/**
 * Append the closing tag of the RDF root element to $RSS.
 *
 * @param string $RSS Output buffer, appended to by reference.
 * @return void
 */
function GenerateRSSFooter(&$RSS) {
    $closingTag = "</rdf:RDF>\n";
    $RSS = $RSS . $closingTag;
}
// Add an RSS item's information to $RSS
/**
 * Append one RSS <item> element to $RSS.
 *
 * Every value is entity-decoded and then XML-escaped before it is written.
 * Already escaped input ("&amp;") is therefore left as it is, while raw
 * "&", "<", ">" and quotes are escaped and HTML-only entities such as
 * "&nbsp;" are turned into their UTF-8 characters.
 *
 * @param string       $href    URL of the news item (rdf:about and <link>).
 * @param string       $title   Item title.
 * @param string       $text    Item description.
 * @param string       $date    Item date; <dc:date> is omitted when it is empty.
 * @param string|false $subject Item subject; <dc:subject> is omitted when
 *                              the value is falsy.
 * @param string       $RSS     Output buffer, appended to by reference.
 * @return void
 */
function GenerateRSSItem($href, $title, $text, $date, $subject, &$RSS) {
    $fields = array(
        'href'    => $href,
        'title'   => $title,
        'text'    => $text,
        'date'    => $date,
        'subject' => $subject,
    );

    // Normalise every value to XML-safe text
    foreach ($fields as $name => $value) {
        $fields[$name] = htmlspecialchars(
            html_entity_decode((string) $value, ENT_QUOTES | ENT_HTML5, 'UTF-8'),
            ENT_QUOTES | ENT_XML1 | ENT_SUBSTITUTE,
            'UTF-8'
        );
    }

    // Optional elements; the subject check uses the caller's raw value
    $subjectXml = $subject ? "\t<dc:subject>" . $fields['subject'] . "</dc:subject>\n" : "";
    $dateXml = ($fields['date'] !== '') ? "\t<dc:date>" . $fields['date'] . "</dc:date>\n" : "";

    $RSS .= "\n<item rdf:about=\"" . $fields['href'] . "\">\n" .
        "\t<title>" . $fields['title'] . "</title>\n" .
        "\t<link>" . $fields['href'] . "</link>\n" .
        $subjectXml .
        "\t<description>" . $fields['text'] . "</description>\n" .
        $dateXml .
        "</item>\n";
}
/**
 * Build the complete RSS document for the news found in "$root/index.php".
 *
 * @param string $root      Directory that contains the index.php to parse.
 * @param string $aboutLink URL of the channel, also used as the root for
 *                          links that are not full URLs.
 * @return string The RSS document. When the index page cannot be read or
 *                holds no news, the document contains the channel only.
 */
function GenerateRSSFile($root, $aboutLink) {
    // Get the index page's source code; an unreadable page is
    // treated like an empty one instead of passing false along
    $homepage = getData("$root/index.php");
    if ($homepage === false) {
        $homepage = '';
    }

    // This returns a data structure containing all the news items found
    $hlines = ParseNews($homepage, $aboutLink);
    if (!is_array($hlines)) {
        $hlines = array();
    }

    // Start with an empty RSS string and generate the RSS header
    $RSS = '';
    GenerateRSSHeader($hlines, $RSS, $aboutLink);

    // Add separator comment
    $RSS .= "<!-- RSS-Items -->\n";

    // Add every news item to the feed; absent entries fall back to
    // an empty string (or false for the optional subject)
    foreach ($hlines as $hline) {
        GenerateRSSItem(
            isset($hline['link']) ? $hline['link'] : '',
            isset($hline['title']) ? $hline['title'] : '',
            isset($hline['text']) ? $hline['text'] : '',
            isset($hline['date']) ? $hline['date'] : '',
            isset($hline['subject']) ? $hline['subject'] : false,
            $RSS
        );
    }

    // Add end separator
    $RSS .= "<!-- / RSS-Items PHP/RSS -->\n";

    // Dump the last XML tag
    GenerateRSSFooter($RSS);

    return $RSS;
}
#$RSSNews = GenerateRSSFile($root, "http://php.net/");
#$RSSConf = GenerateRSSFile("$root/conferences", "http://php.net/conferences/");
?>