爬数据问题

想截取豆瓣的内容,发现以下问题:
一般的网站都有防止被 iframe 包含的脚本,所以用 iframe 不行。
我再试用 window.open 打开窗口来加载脚本,但是
window 的 onload 事件不能监听,可能是跨域问题,那么就做不下去了。
-------------------
>

(function() {
/**
 * Append one line of text to a separate log window (window name 'log').
 *
 * The window handle is cached on `window.mylog_win`; it is (re)opened when it
 * has never been opened or when the user has closed it in the meantime.
 * If the browser refuses to open the window (window.open returns null, e.g.
 * because of a popup blocker) the message goes to the console instead of
 * throwing.
 *
 * @param {*} msg Message to log. It is converted to a string and HTML-escaped,
 *                because callers log content scraped from third-party pages.
 */
function g_log(msg) {
  if (!window.mylog_win || window.mylog_win.closed) {
    window.mylog_win = window.open('', 'log');
  }
  if (!window.mylog_win) {
    // No log window available: fall back to the console when there is one.
    if (typeof console !== 'undefined' && console.log) {
      console.log(msg);
    }
    return;
  }
  // Escape markup so logged page content is shown as text, not rendered.
  var text = String(msg)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
  window.mylog_win.document.write(text + '<br>');
}
/**
 * Crawl task: loads Douban movie subject pages one after another in a window
 * named 'doubanwin', scrapes a few fields from each page and posts them to
 * "mytime.ax" through Ext.Ajax.request.
 *
 * Public API: `run()` starts (or continues) the crawl at the current index.
 *
 * Subject ids INDEX_BEGIN..INDEX_END are visited, both ends included.
 */
function DoubanTask() {
  var URL = 'http://movie.douban.com/subject/';
  var INDEX_BEGIN = 1000000;
  var INDEX_END = 1999999;
  // A failed submit is retried until the failure count exceeds this value.
  var TRYCOUNT = 3;
  var tryicount = 0;
  var win = window.open('', 'doubanwin');
  // Id of the next subject page to load.
  var url_index = INDEX_BEGIN;

  /** Forward a message to the shared log window. */
  function log(msg) {
    g_log(msg);
  }

  /**
   * Return the innerHTML of the first element matching `selector` below
   * `root`, or null when nothing matches.
   */
  function firstInnerHTML(root, selector) {
    var found = root.query(selector);
    return (found && found.length) ? found[0].innerHTML : null;
  }

  /** Strip everything up to and including '(' and from ')' on: "(2010)" -> "2010". */
  function stripParens(text) {
    var pos = text.indexOf('(');
    if (pos !== -1) {
      text = text.substring(pos + 1);
    }
    pos = text.indexOf(')');
    if (pos !== -1) {
      text = text.substring(0, pos);
    }
    return text;
  }

  /**
   * Scrape the page currently shown in the crawl window.
   *
   * @returns {Object|null} {score, vote, name, year, director}, or null when
   *   one of the mandatory fields (score, vote, name) is missing. `year` and
   *   `director` are optional and are '' when the page does not have them.
   */
  function getData() {
    var root = Ext.get(win.document.body);

    var score = firstInnerHTML(root, 'strong[property=v:average]');
    if (score === null) {
      log('score not find,return null.');
      return null;
    }
    var vote = firstInnerHTML(root, 'span[property=v:votes]');
    if (vote === null) {
      log('vote not find,return null.');
      return null;
    }
    var name = firstInnerHTML(root, 'span[property=v:itemreviewed]');
    if (name === null) {
      log('name not find,return null.');
      return null;
    }

    var year = firstInnerHTML(root, 'span[class=year]');
    var director = firstInnerHTML(root, 'a[rel=v:directedBy]');

    return {
      score : score,
      vote : vote,
      name : name,
      year : year === null ? '' : stripParens(year),
      director : director === null ? '' : director
    };
  }

  /**
   * Handle a loaded page: scrape it and submit the result, then move on.
   * A page without data is skipped; a failed submit is retried.
   *
   * @param {string} url Address of the page that was loaded.
   */
  function onloadfunc(url) {
    var data = getData();
    if (!data) {
      log('retrive no data in [' + url + '],continue.');
      runtask();
      return;
    }
    data.url = url;
    log('get data:' + Ext.encode(data));
    Ext.Ajax.request({
      url : "mytime.ax",
      params : data,
      success : function() {
        tryicount = 0;
        log('submit ok for url:' + url);
        runtask();
      },
      failure : function() {
        tryicount += 1;
        log('submit fail[' + tryicount + '] for url:' + url);
        if (tryicount > TRYCOUNT) {
          // Give up on this page and reset the counter for the next one.
          tryicount = 0;
          log('cancel try for:' + url
              + ', move for next none.');
          runtask();
        } else {
          log('continue try for:' + url);
          onloadfunc(url);
        }
      }
    });
  }

  /** Load the next subject page, or finish once INDEX_END has been visited. */
  function runtask() {
    if (url_index > INDEX_END) {
      log('task complete!');
      return;
    }
    if (!win || win.closed) {
      // window.open returned null (popup blocked) or the window was closed.
      log('crawl window is not available, stop.');
      return;
    }
    var url = URL + url_index;
    url_index += 1;
    // Original author's remark: "no use, can't work".
    // NOTE(review): presumably this hook never fires once the window has
    // navigated to the other origin - confirm before relying on it.
    win.document.body.onload = function() {
      onloadfunc(url);
    };
    win.location.href = url;
  }

  this.run = function() {
    runtask();
  };
}
/**
 * Window-load handler: wires the 'btnStart' button so that clicking it runs
 * the crawl task.
 *
 * The task is created on the first click rather than at page load, because
 * its constructor opens a window and browsers commonly block window.open
 * calls that are not triggered by a user action. Later clicks reuse the same
 * task and call run() again.
 */
function onload() {
  var dbTask = null;
  var btnRun1 = Ext.get('btnStart');
  btnRun1.on('click', function() {
    if (!dbTask) {
      dbTask = new DoubanTask();
    }
    dbTask.run();
  });
}
// Register onload as the handler for the window's 'load' event.
Ext.fly(window).on('load', onload);
})();

-------------------------

>

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<!-- Host page for the crawler: loads Ext Core and js/main.js and provides the Start button. -->
<html>
<head>
    <title>run.html</title>
    <meta name="description" content="this is my page">
    <meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">
    
    <!--<link rel="stylesheet" type="text/css" href="./styles.css">-->
    <script type="text/javascript" src="../js/ext-core.js"></script>
    <script type="text/javascript" src="js/main.js"></script>
</head>
<body>
 <table>
  <tr><td>
  <!-- js/main.js attaches its click handler to this button by id. -->
  <button id="btnStart">Start</button>
       
  </td></tr>
  <tr><td><iframe id="frm1"></iframe></td></tr>
 </table>    
</body>
</html>

-------------------------------

换个思路:可以使用 ajax 请求,由服务端通过 httpclient 取得数据返回给客户端,再由客户端写到 iframe 里面,这样就避开了跨域问题了。但是 httpclient 爬数据时要注意伪装成浏览器,加上请求头 'User-Agent' : 'Mozilla/5.0 (Windows NT 5.1; rv:2.0) Gecko/20100101 Firefox/4.0',其余的就不要加了,加了反而会有问题(莫名其妙的返回)。

HtmlUnit 基于 Mozilla 实现,可以考虑。

发现豆瓣提供 API,晕菜。

-------------


Total views.

© 2013 - 2024. All rights reserved.

Powered by Hydejack v6.6.1