代码拉取完成,页面将自动刷新
现有的配置太依赖代码,希望能支持全部放在配置文件里面,这样在最终deploy出去的时候可以方便修改。
<!--
  Proposed single-file Spiderman configuration: the seed script, runtime
  properties, static seeds and page-extraction rules all live in this file.
-->
<spiderman>
  <!--
    Seed script: reads conf/keywords.txt through $K.readLine and adds one
    Baidu search URL per line, with the keyword wrapped in literal double quotes.
    NOTE(review): the keyword is concatenated without URL encoding; confirm
    that $Seeds.add (or the downloader) encodes it.
  -->
  <script>
    <![CDATA[
      $K.readLine('conf/keywords.txt').forEach(line -> {
        $Seeds.add('http://www.baidu.com/s?wd="'+line+'"');
      });
    ]]>
  </script>

  <!-- Runtime properties as key/value pairs. -->
  <conf>
    <property key="debug" value="true" />

    <!-- mapdb.* keys -->
    <property key="mapdb.file" value="store/mapdb" />
    <property key="mapdb.deleteFilesAfterClose" value="false" />

    <!-- zbus.* keys -->
    <property key="zbus.enabled" value="true" />
    <property key="zbus.serverAddress" value="10.8.60.8:15555" />

    <!-- threadSize keys for the downloader, parser and result stages -->
    <property key="downloader.primary.threadSize" value="10" />
    <property key="downloader.secondary.threadSize" value="10" />
    <property key="parser.primary.threadSize" value="10" />
    <property key="parser.secondary.threadSize" value="10" />
    <property key="result.threadSize" value="10" />

    <!-- default downloader and parser class names -->
    <property key="default.downloader" value="spiderman.HttpClientDownloader" />
    <property key="default.parser" value="spiderman.HtmlCleanerParser" />
  </conf>

  <!--
    Static seed URL.
    NOTE(review): "Sipderman" may be a typo of "Spiderman"; confirm before
    changing the query string.
  -->
  <seeds>
    <seed url="http://www.baidu.com/s?wd=Sipderman" />
  </seeds>

  <!-- Page-extraction rules. -->
  <extract>
    <!-- Rules pulled in from separate files. -->
    <include path="demo-target.xml" />
    <include path="demo2-target.xml" />

    <page name="demo" extractor="spiderman.XMLExtractor">
      <!--
        Presumably policy="and" requires every rule to hold and a leading "!"
        on the type negates the rule; confirm against the rule parser.
      -->
      <url-match-rules policy="and">
        <rule type="startsWith" value="https://" />
        <rule type="!endsWith" value=".html" />
      </url-match-rules>
      <model name="items" xpath="//article">
        <field name="title" xpath="./title/text()">
          <filter>$this.substring(1)</filter>
        </field>
        <field name="content" xpath="./content/text()" />
      </model>
      <!-- Body elided in the original proposal. -->
      <model name="contact">
        ...
      </model>
    </page>

    <page name="demo2">
      <url-match-rules policy="or">
        <!--
          The "?" after "/s" is escaped so that it matches the literal query
          separator instead of making the "s" optional. The optional paging
          suffix carries its leading ampersand because the keyword part
          [^&]* stops at the first ampersand.
        -->
        <rule type="regex"><![CDATA[^http://www\.baidu\.com/s\?wd\=.[^&]*(&pn\=\d+)?]]></rule>
        <rule type="endsWith" value=".html"/>
      </url-match-rules>
      <model>
        <!-- Reads the href attribute of the anchors under div#links; flagged isForNewTask. -->
        <field name="links" isForNewTask="true" xpath="//div[@id='links']//a" attr="href">
          <filter>$this.substring(1)</filter>
          <filter call="cleanUrl"/>
        </field>
      </model>
    </page>
  </extract>
</spiderman>
已处理。最新配置结构如下:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Spiderman crawler configuration: runtime properties, a seed-generating
  script, and the page-extraction rules for Baidu web, news and zhidao search.
-->
<spiderman>
  <!-- Run time; accepts {n}s {n}m {n}h {n}d -->
  <property key="duration" value="60s" />
  <!-- Log level: INFO DEBUG WARN ERROR OFF -->
  <property key="logger.level" value="INFO" />
  <!-- Number of download threads -->
  <property key="worker.download.size" value="10" />
  <!-- Number of page-extraction threads -->
  <property key="worker.extract.size" value="10" />
  <!-- Number of result-handling threads -->
  <property key="worker.result.size" value="10" />
  <!-- Custom result callback handler class -->
  <property key="worker.result.handler" value="spiderman.MyResultHandler" />
  <!-- Queue size; 0 means unbounded -->
  <property key="queue.capacity" value="5000" />
  <!--
    Whether queue elements may repeat. Allowed by default; when not allowed,
    a duplicate checker inspects each element before it is enqueued.
  -->
  <property key="queue.element.repeatable" value="true" />
  <!-- The checker needs BDb storage -->
  <property key="queue.checker.bdb.file" value="store/checker" />
  <!-- Whether the queue is backed by ZBus. Off by default; when on, distributed processing is supported -->
  <property key="queue.zbus.enabled" value="false" />
  <!-- ZBus server address: {IP or domain name}:{port} -->
  <property key="queue.zbus.server" value="10.8.60.8:15555" />
  <!-- Registers and creates additional queues for later use -->
  <property key="queue.other.names" value="SPIDERMAN_JSON_RESULT" />

  <!-- Hard-coded seed entry alternative: -->
  <!-- <seed>URL address</seed> -->

  <!-- Creates the seeds dynamically with a script. -->
  <script bindings="spiderman.MyBindings">
    <![CDATA[
      var K = Java.type("net.kernal.spiderman.K");
      // One keyword per line of keywords.txt.
      var kws = K.readLine("keywords.txt");
      for (var i = 0; i < kws.length; i++) {
          var kw = kws[i].trim();
          // Skip blank lines so that no seed is created with an empty keyword.
          if (kw.length === 0) {
              continue;
          }
          var ekw = K.urlEncode(kw);
          // Three seeds per keyword: web search, news search and zhidao search.
          $seeds.add(kw+"-baidu", "http://www.baidu.com/s?wd=" + ekw);
          $seeds.add(kw+"-baidu-news", "http://news.baidu.com/ns?word=" + ekw);
          $seeds.add(kw+"-baidu-zhidao", "http://zhidao.baidu.com/search?word=" + ekw);
      }
    ]]>
  </script>

  <!-- Page-extraction rules -->
  <extract>
    <!-- Register extractors -->
    <extractor name="HtmlCleaner"
               class="net.kernal.spiderman.worker.extract.HtmlCleanerExtractor"
               isDefault="1" />
    <extractor name="Text" class="net.kernal.spiderman.worker.extract.TextExtractor" />

    <!-- Register filters -->
    <filter name="MyFilter" class="spiderman.MyFilter" />

    <!-- Pages to extract -->

    <!--
      Rule type "!contains" with value "baidu"; presumably the leading "!"
      negates the rule, i.e. URLs that do not contain "baidu" (confirm).
      Uses the Text extractor and declares no model.
    -->
    <page name="网页内容" extractor="Text" isUnique="1">
      <url-match-rule type="!contains" value="baidu" />
    </page>

    <!-- Question pages under http://zhidao.baidu.com/question/ -->
    <page name="百度知道内容" isUnique="1">
      <url-match-rule type="startsWith" value="http://zhidao.baidu.com/question/" />
      <model>
        <field name="title" xpath="//h1[@accuse='qTitle']//span/text()" />
        <field name="question" xpath="//pre[@class='line mt-5 q-content']/text()" />
        <field name="answers" xpath="//div[@class='line content']/text()" isArray="1">
          <!-- Script filter: removes three literal substrings from each answer text. -->
          <filter type="script">$this.replace('分享','').replace('评论','').replace('|','')</filter>
        </field>
        <field name="bestAnswer"
               xpath="//div[@class='wgt-quality mod-shadow']//div[@class='quality-content-detail content']/text()" />
      </model>
    </page>

    <!--
      Baidu web search result pages.
      NOTE(review): unlike the two sibling search pages, this pattern wraps the
      URL prefix in a lookahead (?=...); left unchanged, confirm it is intended.
      NOTE(review): the first field name ends in a lowercase "l" ("URl") while
      its sibling uses "URL"; kept as is because consumers may key on the name.
    -->
    <page name="百度网页搜索">
      <url-match-rule type="regex"><![CDATA[(?=http://www\.baidu\.com/s\?wd\=).[^&]*(&pn\=\d+)?]]></url-match-rule>
      <model>
        <field name="详情URl"
               isForNewTask="1"
               isArray="1"
               xpath="//div[@id='content_left']//div[@class='result c-container ']//h3//a[@href]"
               attr="href" />
        <field name="分页URL"
               isForNewTask="1"
               isArray="1"
               isDistinct="1"
               filter="MyFilter"
               xpath="//div[@id='page']//a[@href]"
               attr="href" />
      </model>
    </page>

    <!-- Baidu news search result pages. -->
    <page name="百度新闻搜索">
      <url-match-rule type="regex"><![CDATA[http://news\.baidu\.com/ns\?word\=.[^&]*(&pn\=\d+)?]]></url-match-rule>
      <model>
        <field name="详情URl"
               isForNewTask="1"
               isArray="1"
               xpath="//div[@id='content_left']//div[@class='result']//h3//a[@href]"
               attr="href" />
        <field name="分页URL"
               isForNewTask="1"
               isArray="1"
               isDistinct="1"
               filter="MyFilter"
               xpath="//p[@id='page']//a[@href]"
               attr="href" />
      </model>
    </page>

    <!-- Baidu zhidao search result pages. -->
    <page name="百度知道搜索">
      <url-match-rule type="regex"><![CDATA[http://zhidao\.baidu\.com/search\?word\=.[^&]*(&pn\=\d+)?]]></url-match-rule>
      <model>
        <field name="详情URl"
               isForNewTask="1"
               isArray="1"
               xpath="//div[@class='list']//dl//dt//a[@href]"
               attr="href" />
        <field name="分页URL"
               isForNewTask="1"
               isArray="1"
               isDistinct="1"
               filter="MyFilter"
               xpath="//div[@class='pager']//a[@href]"
               attr="href" />
      </model>
    </page>
  </extract>
</spiderman>
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违反国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。
状态更改为 已关闭
登录 后才可以发表评论