爬数据问题
in Web / Web前端 on web html css 前端 - Hits()
想抓取豆瓣的内容,发现以下问题:
一般的网站都有防止被 iframe 包含的脚本,所以用 iframe 不行。
我再试着用 window.open 打开窗口来加载脚本,但是
该窗口的 onload 事件无法监听,可能是跨域问题,那么就做不下去了。
-------------------
>
(function() {
/**
 * Appends a message, followed by a line break, to a separate log window.
 *
 * The window named 'log' is opened on first use and cached on
 * window.mylog_win. It is reopened when the cached window has been closed.
 * When no window can be obtained (window.open returned null, e.g. because a
 * popup blocker intervened) the message goes to the console instead of
 * throwing on a null reference.
 *
 * @param {*} msg Message to append; it is written verbatim as HTML.
 */
function g_log(msg) {
    var logWin = window.mylog_win;
    if (!logWin || logWin.closed) {
        logWin = window.open('', 'log');
        window.mylog_win = logWin;
    }
    if (!logWin) {
        // No log window available: keep the message visible somewhere.
        if (typeof console !== 'undefined' && console.log) {
            console.log(msg);
        }
        return;
    }
    logWin.document.write(msg + '<br>');
}
/**
 * Crawl task for Douban movie subject pages.
 *
 * Opens a helper window named 'doubanwin', points it at one subject page
 * after another, scrapes score / votes / title / year / director from the
 * loaded document and posts each record to "mytime.ax".
 *
 * NOTE: according to the author's own comment in runtask(), the load hook on
 * the helper window does not work, so the scrape step is never triggered in
 * practice (the surrounding notes suspect a cross-origin restriction).
 *
 * Public interface: run() starts the crawl.
 */
function DoubanTask() {
    var URL = 'http://movie.douban.com/subject/';
    // Inclusive range of subject ids to visit.
    var INDEX_BEGIN = 1000000;
    var INDEX_END = 1999999;
    // Retries allowed after a failed submit before moving to the next page.
    var TRYCOUNT = 3;
    var tryicount = 0;
    var win = window.open('', 'doubanwin');
    // Id of the next subject page to load.
    var url_index = INDEX_BEGIN;

    /** Forwards a message to the shared log window. */
    function log(msg) {
        g_log(msg);
    }

    /** Returns innerHTML of the first node matching selector, or null if none. */
    function firstHtml(root, selector) {
        var nodes = root.query(selector);
        if (!nodes || !nodes.length) {
            return null;
        }
        return nodes[0].innerHTML;
    }

    /** Drops everything up to the first '(' and from the following ')' on. */
    function stripParens(text) {
        var open = text.indexOf('(');
        if (open !== -1) {
            text = text.substring(open + 1);
        }
        var close = text.indexOf(')');
        if (close !== -1) {
            text = text.substring(0, close);
        }
        return text;
    }

    /**
     * Scrapes the document currently shown in the helper window.
     *
     * @returns {Object|null} {score, vote, name, year, director}, or null
     *     when score, vote or name is missing. year and director are
     *     optional and are '' when the page does not have them.
     */
    function getData() {
        var doc = Ext.get(win.document.body);

        var score = firstHtml(doc, 'strong[property=v:average]');
        if (score === null) {
            log('score not find,return null.');
            return null;
        }

        var vote = firstHtml(doc, 'span[property=v:votes]');
        if (vote === null) {
            // Previously logged the copy-pasted 'score not find' message.
            log('vote not find,return null.');
            return null;
        }

        var name = firstHtml(doc, 'span[property=v:itemreviewed]');
        if (name === null) {
            log('name not find,return null.');
            return null;
        }

        // Optional fields: normalise "missing" to '' so the record always
        // carries strings rather than an empty query result array.
        var year = firstHtml(doc, 'span[class=year]');
        var director = firstHtml(doc, 'a[rel=v:directedBy]');

        return {
            score : score,
            vote : vote,
            name : name,
            year : year ? stripParens(year) : '',
            director : director === null ? '' : director
        };
    }

    /**
     * Scrapes the page for the given url and submits the record. After a
     * successful submit, or when the page yields no data, the next page is
     * loaded. A failed submit is retried up to TRYCOUNT times.
     *
     * @param {string} url Address of the page currently loaded.
     */
    function onloadfunc(url) {
        var data = getData();
        if (!data) {
            log('retrive no data in [' + url + '],continue.');
            runtask();
            return;
        }
        data.url = url;
        log('get data:' + Ext.encode(data));
        Ext.Ajax.request({
            url : "mytime.ax",
            params : data,
            success : function() {
                tryicount = 0;
                log('submit ok for url:' + url);
                runtask();
            },
            failure : function() {
                tryicount += 1;
                log('submit fail[' + tryicount + '] for url:' + url);
                if (tryicount > TRYCOUNT) {
                    tryicount = 0;
                    log('cancel try for:' + url
                            + ', move for next none.');
                    runtask();
                } else {
                    log('continue try for:' + url);
                    onloadfunc(url);
                }
            }
        });
    }

    /**
     * Loads the next subject page into the helper window, or reports
     * completion once every id in [INDEX_BEGIN, INDEX_END] has been used.
     */
    function runtask() {
        if (url_index > INDEX_END) {
            log('task complete!');
            return;
        }
        // Use the current id before advancing so that INDEX_BEGIN itself
        // and INDEX_END itself are both visited.
        var url = URL + url_index;
        url_index += 1;
        //no use, can't work
        win.document.body.onload = function() {
            onloadfunc(url);
        };
        win.location.href = url;
    }

    /** Starts (or continues) the crawl from the current index. */
    this.run = function() {
        runtask();
    };
}
/**
 * Page initialiser: creates the crawl task and starts it whenever the
 * element with id 'btnStart' is clicked.
 */
function onload() {
    var crawlTask = new DoubanTask();

    // Named handler; the click event arguments are not used.
    function startCrawl() {
        crawlTask.run();
    }

    Ext.get('btnStart').on('click', startCrawl);
}
// Run onload() on the window's load event; onload() looks up the start
// button by id, so the markup needs to be in place by then.
var hostWindow = Ext.fly(window);
hostWindow.on('load', onload);
})();
-------------------------
>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<title>run.html</title>
<!-- "description" is page metadata, not an HTTP header, so it uses name= rather than http-equiv=. -->
<meta name="description" content="this is my page">
<meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
<!--<link rel="stylesheet" type="text/css" href="./styles.css">-->
<!-- Ext Core is loaded first; js/main.js is presumably the crawler script, which uses the Ext global. -->
<script type="text/javascript" src="../js/ext-core.js"></script>
<script type="text/javascript" src="js/main.js"></script>
</head>
<body>
<table>
<tr><td>
<!-- The crawler script binds its click handler to this id. -->
<button id="btnStart">Start</button>
</td></tr>
<tr><td><iframe id="frm1"></iframe></td></tr>
</table>
</body>
</html>
-------------------------------
换个思路:可以用 ajax 请求让服务端通过 httpclient 取得数据并返回给客户端,再由客户端
写到 iframe 里面,这样就避开了跨域问题。但是用 httpclient 爬数据时要注意伪装成浏览器,加上请求头 'User-Agent' : 'Mozilla/5.0 (Windows NT 5.1; rv:2.0) Gecko/20100101 Firefox/4.0',其余的请求头就不要加了,加了反而会有问题(返回莫名其妙的内容)。
HtmlUnit 基于 Mozilla 实现,可以考虑。
后来发现豆瓣本身就提供 API,晕菜。
-------------