功能:抓取整个站点的图片。目前尚未借助PHP的curl扩展开发,后期完善。
配置:config目录下
domain_name:域名(默认:bizhibar.com)
request_site:网站网址(默认:http://www.bizhibar.com/)
request_url:从网站的哪个页面开始(默认:http://www.bizhibar.com/)
accept_type: 图片类型(默认:gif, bmp, png, ico, jpg, jpeg)
save_path:图片保存路径(默认:savefiles/)
partition_name:图片保存目录名称前缀(默认:img_)
dir_file_limit: 每个目录容许多少个文件(默认:100)
serialize_img_size: 当读取了多少个图片地址才缓存到cache目录下的accompImg文件当中,下次继续抓取的时候会忽略这些地址。(默认:30)
serialize_url_size:与serialize_img_size类似,当读取了多少个链接地址才缓存到cache目录下的overURL文件当中,下次继续抓取的时候会忽略这些地址。(默认:10)
说明:欢迎诸君批评指教,有任何新问题或者需要改进的地方,请您反馈给我
<?php
/**
 * Entry script: builds the path map for the crawler and starts the crawl.
 *
 * Capture.const.php is loaded first because the __Home__ and __Os__ constants
 * used below are not defined in this script; they are expected to come from
 * that file.
 */

// The crawl may run for a long time, so lift PHP's execution time limit.
set_time_limit(0);

require dirname(__FILE__) . DIRECTORY_SEPARATOR . 'include' . DIRECTORY_SEPARATOR . 'Capture.const.php';
require __Home__ . 'include' . __Os__ . 'Capture.class.php';

// Locations of the two config files and the two resume-cache files.
$_cfg = array(
    'site'      => __Home__ . 'config' . __Os__ . 'capture.site.php',
    'preg'      => __Home__ . 'config' . __Os__ . 'capture.preg.php',
    'accompImg' => __Home__ . 'cache' . __Os__ . 'accompImg',
    'overURL'   => __Home__ . 'cache' . __Os__ . 'overURL'
);

// Capture's constructor takes its argument by reference, so it must be given
// a variable rather than an inline array.
$_parse = new Capture($_cfg);
$_parse->parseQuestUrl();

?>
<?php
/**
 * The main class of the crawler.
 *
 * Starting from the configured 'request_url', it hands each fetched page to
 * Pivotal, walks the links Pivotal::parseUrl() returns depth-first, and only
 * follows links that contain the configured 'domain_name'. Visited URLs are
 * kept in a list that is periodically serialized to the 'overURL' cache file
 * so that a later run can skip them.
 *
 * Depends on classes loaded through import(): OperateFile, Pivotal and Http.
 * Their behaviour is not visible in this file.
 *
 * @author pankai<530911044@qq.com>
 * @date 2013-08-10
 */
class Capture {
    /** @var array Path map given to the constructor (keys: site, preg, accompImg, overURL). */
    private static $_Config = array();

    /** @var array|null Value returned by the 'site' config file. */
    private static $_CapSite = NULL;

    /** @var array|null Value returned by the 'preg' config file, with '_request_site' substituted. */
    private static $_CapPreg = NULL;

    /** @var array URLs that have already been visited (restored from the overURL cache). */
    private static $_overURL = array();

    /** @var bool Set to FALSE when getCapInstance() starts and TRUE just before it returns. */
    private $_mark = FALSE;

    /** @var int Multiplier applied to 'preform_page_time' when pausing between pages. */
    private static $_markTime = 1;

    /**
     * Initialize the main class: load both config files and restore the caches.
     *
     * @param array $_cfg Path map with the keys 'site', 'preg', 'accompImg'
     *                    and 'overURL'. Taken by reference, so the caller must
     *                    pass a variable.
     */
    public function __construct(&$_cfg) {
        self::$_Config = &$_cfg;

        self::$_CapSite = require $_cfg['site'];
        self::$_CapPreg = require $_cfg['preg'];

        // Patterns may contain the placeholder '_request_site'; replace it with
        // the configured site address.
        foreach (self::$_CapPreg as $_key => $_value) {
            self::$_CapPreg[$_key] = str_replace('_request_site', self::$_CapSite['request_site'], $_value);
        }

        self::import('file.OperateFile');
        $_saved = self::loadCache($_cfg['overURL']);
        if ($_saved !== NULL) {
            self::$_overURL = $_saved;
        }

        self::import('pivotal.Pivotal');
        $_saved = self::loadCache($_cfg['accompImg']);
        if ($_saved !== NULL) {
            Pivotal::$_accompImg = $_saved;
        }
    }

    /**
     * Load a class file, Java-package style: 'file.OperateFile' is resolved to
     * __Home__ . 'include' . __Os__ . 'file' . __Os__ . 'OperateFile.class.php'.
     *
     * @param string $_class Dot-separated path of the class below the include directory.
     */
    public static function import($_class) {
        require_once __Home__ . 'include' . __Os__ . str_replace('.', __Os__, $_class) . '.class.php';
    }

    /**
     * Read a serialized array from a cache file.
     *
     * Requires OperateFile to be imported already.
     *
     * @param string $_file Path of the cache file.
     * @return array|null The stored array, or NULL when the file is missing,
     *                    empty, or does not unserialize to an array.
     */
    private static function loadCache($_file) {
        if (!file_exists($_file)) {
            return NULL;
        }
        $_size = filesize($_file);
        if (!($_size > 0)) {
            return NULL;
        }
        // unserialize() returns FALSE for corrupt data; never let that replace
        // an array that is later handed to in_array()/count().
        $_data = unserialize(OperateFile::readText($_file, $_size));

        return is_array($_data) ? $_data : NULL;
    }

    /**
     * Write the list of visited URLs to the overURL cache file.
     */
    private static function saveOverURL() {
        OperateFile::setText(self::$_Config['overURL'], serialize(self::$_overURL));
    }

    /**
     * Create an instance of the Pivotal class for one fetched page and return
     * whatever its parseUrl() yields.
     *
     * @param mixed $_source Result of Http::get() for the page (by reference;
     *                       callers must pass a variable).
     * @return mixed Result of Pivotal::parseUrl(); roundTagA() treats it as a
     *               possibly nested array of URLs.
     */
    private function getCapInstance(&$_source) {
        $this->_mark = FALSE;

        $_Captal = new Pivotal(self::$_Config, $_source);
        $_tagA = $_Captal->parseUrl();

        $this->_mark = TRUE;

        return $_tagA;
    }

    /**
     * Go forward one by one: visit every URL in a (possibly nested) link list.
     *
     * NOTE(review): each visited page recurses into its own links, so the call
     * depth grows with the length of the link chain being followed.
     *
     * @param array|null $_tagArr Link list as returned by getCapInstance().
     */
    private function roundTagA(&$_tagArr) {
        if (empty($_tagArr) || !is_array($_tagArr)) {
            return;
        }

        // foreach instead of a 0..count-1 index loop, so lists with gaps in
        // their keys are still walked completely.
        foreach ($_tagArr as $_item) {
            if (is_array($_item)) {
                $this->roundTagA($_item);
                continue;
            }

            // Only follow links that mention the configured domain.
            if (stripos($_item, self::$_CapSite['domain_name']) === FALSE) {
                continue;
            }
            // Skip anything visited in this run or a previous one.
            if (in_array($_item, self::$_overURL)) {
                continue;
            }

            self::$_overURL[] = $_item;

            // Persist the visited list every 'serialize_url_size' URLs. A
            // missing or non-positive setting disables the periodic write
            // instead of triggering a modulo-by-zero error.
            $_batch = isset(self::$_CapSite['serialize_url_size'])
                ? (int) self::$_CapSite['serialize_url_size']
                : 0;
            if ($_batch > 0 && count(self::$_overURL) % $_batch == 0) {
                self::saveOverURL();
            }

            $_delay = isset(self::$_CapSite['preform_page_time'])
                ? (int) self::$_CapSite['preform_page_time']
                : 0;

            // NOTE(review): getCapInstance() sets _mark to TRUE before every
            // normal return, so with the code visible here this loop body runs
            // once per URL; the doubling branch is kept for the case where the
            // flag protocol changes.
            do {
                // Fetch into a variable first: getCapInstance() takes its
                // argument by reference and a call result is not a variable.
                $_page = Http::get($_item);
                $_tagA = $this->getCapInstance($_page);

                sleep(max(0, $_delay * self::$_markTime));

                if ($this->_mark === TRUE) {
                    // Reset the multiplier to its initial value. Resetting it
                    // to the delay itself made every later pause delay * delay.
                    self::$_markTime = 1;
                    break;
                }
                self::$_markTime *= 2;
            } while (true);

            /* parse the main page and return next page */
            $this->roundTagA($_tagA);
        }
    }

    /**
     * Start the crawl from the configured 'request_url'.
     */
    public function parseQuestUrl() {
        self::import('http.Http');

        $_page = Http::get(self::$_CapSite['request_url']);
        $_round_Arr = $this->getCapInstance($_page);
        $this->roundTagA($_round_Arr);

        // Flush URLs visited since the last full batch so a later run does
        // not fetch them again.
        if (!empty(self::$_overURL)) {
            self::saveOverURL();
        }
    }
}

?>