[免费]土豆网视频专辑采集接口详解

接上篇:
[免费下载]土豆网专辑采集规则+接口
http://www.4wei.cn/archives/1001491

以下是接口全文。
我给一些重要步骤做了注释。

tudou.php下载

<?php
/*
 * Tudou playlist scraping endpoint.
 *
 * Reads a playlist id from the "lid" query parameter, downloads the matching
 * Tudou playlist page, extracts every (video id, title) pair and prints one
 * marker-delimited record per video for the calling collector to parse.
 */

/*
 * Give the script up to 60 seconds so that slow remote fetches (including
 * retries) do not hit PHP's execution time limit.
 */
set_time_limit(60);

/*
 * Playlist id passed in by the collector. A missing, non-numeric or
 * non-positive value ends the request with no output.
 */
$lid = isset($_GET["lid"]) ? intval($_GET["lid"]) : 0;
if ($lid <= 0) exit();

$uri = sprintf("http://www.tudou.com/playlist/p/l%d.html", $lid);

/*
 * Fetch the playlist page: one initial attempt plus up to three retries,
 * stopping as soon as a non-empty body comes back.
 */
$html = '';
for ($attempt = 0; $attempt < 4 && empty($html); $attempt++) {
    $html = curl($uri);
}

/*
 * If the body is still gzip-compressed, inflate it. Detection uses the gzip
 * magic bytes (0x1f 0x8b) rather than the position of the first "<": plain
 * HTML may start with a BOM or whitespace, and compressed data may happen to
 * contain no "<" byte at all.
 */
if (is_string($html) && strncmp($html, "\x1f\x8b", 2) === 0) {
    $html = gzdecode($html);
}
if (!is_string($html)) {
    // Fetch or decompression failed; treat as an empty page so nothing is printed.
    $html = '';
}

/*
 * Capture each consecutive video id ($match[1]) and title ($match[2]).
 */
preg_match_all('@iid:(\d+)\s+?,title:"([^"]+)"@is', $html, $match);

/*
 * Build the Discuz [flash] player code for every video found.
 */
foreach ($match[1] as $key => $iid) {
    $code = sprintf("[flash]http://www.tudou.com/player/outside/player_outside_list.swf?iid=%d&default_skin=http://js.tudouui.com/bin/player2/outside/Skin_outside_list_7.swf&autoPlay=false&listType=1&rurl=&autostart=false&lid=%d[/flash]", $iid, $lid);

    /*
     * Emit the record wrapped in start/end markers so the collector can loop
     * over the output. $code is passed as an argument, not interpolated into
     * the format string, so a stray "%" could never be read as a directive.
     */
    printf("<!--titlestart-->%s<!--titleend--><!--codestart-->%s<!--codeend-->", $match[2][$key], $code);
}

/**
 * Download a URL with cURL and return the response body.
 *
 * The request asks for gzip content encoding (libcurl then decodes the body
 * itself) and is capped at a 2 second total timeout.
 *
 * @param string $url Address to fetch.
 * @return string Response body, or an empty string when the handle cannot be
 *                created or the transfer fails. Returning "" instead of the
 *                cURL error text lets callers detect failure with empty()
 *                and retry, and keeps error messages from being parsed as
 *                page content.
 */
function curl($url) {
    $ch = curl_init();
    if ($ch === false) {
        return '';
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_TIMEOUT, 2);
    curl_setopt($ch, CURLOPT_ENCODING, "gzip");

    $data = curl_exec($ch);
    $failed = ($data === false) || curl_errno($ch) !== 0;

    // Release the handle on every path. From PHP 8.0 on, curl_close() is a
    // no-op (handles are freed automatically), so it is only called on
    // older runtimes.
    if (PHP_VERSION_ID < 80000) {
        curl_close($ch);
    }

    return $failed ? '' : $data;
}

/**
 * Pure-PHP decoder for a single-member gzip stream (RFC 1952).
 *
 * Validates the fixed header, skips the optional EXTRA / NAME / COMMENT /
 * header-CRC fields, inflates the deflate body and verifies the trailing
 * CRC32 and uncompressed size.
 *
 * @param string $data Raw gzip bytes.
 * @return string|null|false The decompressed data; null when the input is
 *                           not gzip (bad magic, too short, reserved flag
 *                           bits set, empty body); false when the stream is
 *                           malformed, uses an unsupported compression
 *                           method, or fails the size/CRC check.
 */
function gzdecode_compat($data) {
    $len = strlen($data);
    // Smallest possible member: 10-byte header + 8-byte trailer.
    if ($len < 18 || strncmp($data, "\x1f\x8b", 2) !== 0) {
        return null; // Not GZIP format (see RFC 1952)
    }

    $method = ord($data[2]); // Compression method
    $flags  = ord($data[3]); // Flag bits

    // Bits 5-7 are reserved and must be zero. The parentheses matter:
    // "!=" binds tighter than "&".
    if (($flags & 31) != $flags) {
        return null;
    }

    // Bytes 4-9 (MTIME, XFL, OS) carry nothing needed for decoding.
    $headerlen = 10;

    if ($flags & 4) {
        // FEXTRA: 2-byte little-endian length directly after the fixed
        // header, followed by that many bytes of extra data.
        if ($len - $headerlen - 2 < 8) {
            return false; // Invalid format
        }
        $extralen = unpack("v", substr($data, $headerlen, 2));
        $extralen = $extralen[1];
        if ($len - $headerlen - 2 - $extralen < 8) {
            return false; // Invalid format
        }
        $headerlen += 2 + $extralen;
    }

    // FNAME (8) then FCOMMENT (16): each is a NUL-terminated string that
    // starts at the current header offset.
    foreach (array(8, 16) as $bit) {
        if ($flags & $bit) {
            if ($len - $headerlen - 1 < 8) {
                return false; // Invalid format
            }
            $end = strpos($data, "\0", $headerlen);
            if ($end === false || $len - $end - 1 < 8) {
                return false; // Unterminated string or no room for trailer
            }
            $headerlen = $end + 1;
        }
    }

    if ($flags & 2) {
        // FHCRC: low 16 bits of the CRC32 of all header bytes so far.
        // (Bit 0 is FTEXT, a text hint with no header field.)
        if ($len - $headerlen - 2 < 8) {
            return false; // Invalid format
        }
        $calccrc   = crc32(substr($data, 0, $headerlen)) & 0xffff;
        $headercrc = unpack("v", substr($data, $headerlen, 2));
        if ($headercrc[1] != $calccrc) {
            return false; // Bad header CRC
        }
        $headerlen += 2;
    }

    // Trailer: CRC32 of the uncompressed data, then its length mod 2^32.
    // Both may come out negative on 32-bit PHP; crc32() is signed there
    // too, so the loose comparisons below stay consistent.
    $datacrc = unpack("V", substr($data, -8, 4));
    $datacrc = $datacrc[1];
    $isize   = unpack("V", substr($data, -4));
    $isize   = $isize[1];

    $bodylen = $len - $headerlen - 8;
    if ($bodylen < 1) {
        return null; // No compressed payload
    }
    if ($method !== 8) {
        return false; // Only "deflate" is defined by RFC 1952
    }

    $out = gzinflate(substr($data, $headerlen, $bodylen));
    if ($out === false) {
        return false; // Corrupt deflate stream
    }

    if ($isize != strlen($out) || crc32($out) != $datacrc) {
        return false; // Length or CRC mismatch
    }
    return $out;
}

// PHP >= 5.4 ships gzdecode() in the zlib extension, and redeclaring a native
// function is a fatal error, so the userland version is registered only when
// the native one is missing. NOTE: a conditional declaration is not hoisted;
// on such runtimes gzdecode() exists only after this statement has executed.
if (!function_exists('gzdecode')) {
    /**
     * Fallback gzdecode() for runtimes without the native zlib function.
     *
     * @param string $data Raw gzip bytes.
     * @return string|null|false See gzdecode_compat().
     */
    function gzdecode($data) {
        return gzdecode_compat($data);
    }
}

?>

发表评论

电子邮件地址不会被公开。