/ 详情

改进配置文件设计

Done
owner
Opened this issue  
2016-01-06 11:37

现有的配置太依赖代码,希望能支持全部放在配置文件里面,这样在最终deploy出去的时候可以方便修改。

<spiderman>
  <!-- Proposed layout: seed script, runtime properties, static seeds and extraction rules all live in this one file. -->
  <!-- Adds one Baidu search seed per line read from conf/keywords.txt; the keyword is wrapped in double quotes inside the query. -->
  <script>
    <![CDATA[
        $K.readLine('conf/keywords.txt').forEach(line -> {
            $Seeds.add('http://www.baidu.com/s?wd="'+line+'"');
        });
    ]]>
  </script>
  <!-- Runtime properties as key/value pairs. -->
  <conf>
    <property key="debug" value="true" />
    <!-- MapDB storage settings. -->
    <property key="mapdb.file" value="store/mapdb" />
    <property key="mapdb.deleteFilesAfterClose" value="false" />
    <!-- ZBus switch and server address (host:port). -->
    <property key="zbus.enabled" value="true" />
    <property key="zbus.serverAddress" value="10.8.60.8:15555" />
    <!-- Thread sizes for the downloader, parser and result stages. -->
    <property key="downloader.primary.threadSize" value="10" />
    <property key="downloader.secondary.threadSize" value="10" />
    <property key="parser.primary.threadSize" value="10" />
    <property key="parser.secondary.threadSize" value="10" />
    <property key="result.threadSize" value="10" />
    <!-- Default downloader and parser classes. -->
    <property key="default.downloader" value="spiderman.HttpClientDownloader" />
    <property key="default.parser" value="spiderman.HtmlCleanerParser" />
  </conf>
  <!-- Statically declared seed URLs. -->
  <seeds>
    <seed url="http://www.baidu.com/s?wd=Spiderman" />
  </seeds>
  <!-- Page extraction rules. -->
  <extract>
    <!-- NOTE(review): presumably pulls page definitions in from external files; confirm against the loader. -->
    <include path="demo-target.xml" />
    <include path="demo2-target.xml" />
    <page name="demo" extractor="spiderman.XMLExtractor">
      <!-- policy="and" with two rules; NOTE(review): the "!" type prefix presumably negates the rule; confirm. -->
      <url-match-rules policy="and">
        <rule type="startsWith" value="https://" />
        <rule type="!endsWith" value=".html" />
      </url-match-rules>
      <model name="items" xpath="//article">
        <field name="title" xpath="./title/text()">
          <filter>$this.substring(1)</filter>
        </field>
        <field name="content" xpath="./content/text()" />
      </model>
      <model name="contact">
        <!-- Placeholder: the fields of this model are elided in the sketch. -->
        ...
      </model>
    </page>
    <page name="demo2">
      <url-match-rules policy="or">
        <!-- "\?" matches the literal query separator. The optional pager group carries its own "&" because "[^&]*" stops right before it. -->
        <rule type="regex"><![CDATA[^http://www\.baidu\.com/s\?wd\=.[^&]*(&pn\=\d+)?]]></rule>
        <rule type="endsWith" value=".html"/>
      </url-match-rules>
      <model>
        <!-- Reads the href attribute of links under div#links; flagged isForNewTask. Two filters are listed: an inline expression and a named call. -->
        <field name="links" isForNewTask="true" xpath="//div[@id='links']//a" attr="href">
          <filter>$this.substring(1)</filter>
          <filter call="cleanUrl"/>
        </field>
      </model>
    </page>
  </extract>
</spiderman>

Comments (2)

已处理。最新配置结构如下:

<?xml version="1.0" encoding="UTF-8"?>
<spiderman>
    <!-- Run duration; accepts {n}s, {n}m, {n}h or {n}d -->
    <property key="duration" value="60s" />
    <!-- Log level: INFO, DEBUG, WARN, ERROR or OFF -->
    <property key="logger.level" value="INFO" />
    <!-- Number of download threads -->
    <property key="worker.download.size" value="10" />
    <!-- Number of page-extraction threads -->
    <property key="worker.extract.size" value="10" />
    <!-- Number of result-handling threads -->
    <property key="worker.result.size" value="10" />
    <!-- Custom result callback handler class -->
    <property key="worker.result.handler" value="spiderman.MyResultHandler" />
    <!-- Queue capacity; 0 means unbounded -->
    <property key="queue.capacity" value="5000" />
    <!-- Whether queue elements may repeat (allowed by default); when not allowed,
         a duplicate checker inspects each element before it enters the queue -->
    <property key="queue.element.repeatable" value="true" />
    <!-- The checker needs BDb storage -->
    <property key="queue.checker.bdb.file" value="store/checker" />
    <!-- Whether the queue is backed by ZBus (default: no); when enabled, distributed processing is supported -->
    <property key="queue.zbus.enabled" value="false" />
    <!-- ZBus server address: {IP or domain name}:{port} -->
    <property key="queue.zbus.server" value="10.8.60.8:15555" />
    <!-- Additional queues to register and create for later use -->
    <property key="queue.other.names" value="SPIDERMAN_JSON_RESULT" />

    <!-- Hard-coded seed entry alternative: <seed>URL address</seed> -->
    <!-- Seeds created dynamically by script -->
    <script bindings="spiderman.MyBindings">
    <![CDATA[
        var K = Java.type("net.kernal.spiderman.K");
        var keywords = K.readLine("keywords.txt");
        for (var idx = 0; idx < keywords.length; idx++) {
            var keyword = keywords[idx].trim();
            var encoded = K.urlEncode(keyword);
            $seeds.add(keyword + "-baidu", "http://www.baidu.com/s?wd=" + encoded);
            $seeds.add(keyword + "-baidu-news", "http://news.baidu.com/ns?word=" + encoded);
            $seeds.add(keyword + "-baidu-zhidao", "http://zhidao.baidu.com/search?word=" + encoded);
        }
    ]]>
    </script>

    <!-- Page extraction rules -->
    <extract>
        <!-- Registered extractors -->
        <extractor name="HtmlCleaner"
                   class="net.kernal.spiderman.worker.extract.HtmlCleanerExtractor"
                   isDefault="1" />
        <extractor name="Text" class="net.kernal.spiderman.worker.extract.TextExtractor" />

        <!-- Registered filters -->
        <filter name="MyFilter" class="spiderman.MyFilter" />

        <!-- Pages to extract -->
        <page name="网页内容" extractor="Text" isUnique="1">
            <url-match-rule type="!contains" value="baidu" />
        </page>

        <page name="百度知道内容" isUnique="1">
            <url-match-rule type="startsWith" value="http://zhidao.baidu.com/question/" />
            <model>
                <field name="title" xpath="//h1[@accuse='qTitle']//span/text()" />
                <field name="question" xpath="//pre[@class='line mt-5 q-content']/text()" />
                <field name="answers" xpath="//div[@class='line content']/text()" isArray="1">
                    <filter type="script">$this.replace('分享','').replace('评论','').replace('|','')</filter>
                </field>
                <field name="bestAnswer"
                       xpath="//div[@class='wgt-quality mod-shadow']//div[@class='quality-content-detail content']/text()" />
            </model>
        </page>

        <page name="百度网页搜索">
            <url-match-rule type="regex"><![CDATA[(?=http://www\.baidu\.com/s\?wd\=).[^&]*(&pn\=\d+)?]]></url-match-rule>
            <model>
                <field name="详情URl"
                       xpath="//div[@id='content_left']//div[@class='result c-container ']//h3//a[@href]"
                       attr="href"
                       isForNewTask="1"
                       isArray="1" />
                <field name="分页URL"
                       xpath="//div[@id='page']//a[@href]"
                       attr="href"
                       filter="MyFilter"
                       isForNewTask="1"
                       isArray="1"
                       isDistinct="1" />
            </model>
        </page>

        <page name="百度新闻搜索">
            <url-match-rule type="regex"><![CDATA[http://news\.baidu\.com/ns\?word\=.[^&]*(&pn\=\d+)?]]></url-match-rule>
            <model>
                <field name="详情URl"
                       xpath="//div[@id='content_left']//div[@class='result']//h3//a[@href]"
                       attr="href"
                       isForNewTask="1"
                       isArray="1" />
                <field name="分页URL"
                       xpath="//p[@id='page']//a[@href]"
                       attr="href"
                       filter="MyFilter"
                       isForNewTask="1"
                       isArray="1"
                       isDistinct="1" />
            </model>
        </page>

        <page name="百度知道搜索">
            <url-match-rule type="regex"><![CDATA[http://zhidao\.baidu\.com/search\?word\=.[^&]*(&pn\=\d+)?]]></url-match-rule>
            <model>
                <field name="详情URl"
                       xpath="//div[@class='list']//dl//dt//a[@href]"
                       attr="href"
                       isForNewTask="1"
                       isArray="1" />
                <field name="分页URL"
                       xpath="//div[@class='pager']//a[@href]"
                       attr="href"
                       filter="MyFilter"
                       isForNewTask="1"
                       isArray="1"
                       isDistinct="1" />
            </model>
        </page>
    </extract>
</spiderman>

Status changed to closed

Sign in to comment

状态
Assignees
Milestones
Pull Requests
关联的 Pull Requests 被合并后可能会关闭此 issue
Branches
Planned to start   -   Planned to end
-
Top level
Priority
参与者(1)
117 l weiwei 1578913730