ggsonic
  • 浏览: 259258 次
  • 性别: Icon_minigender_1
  • 来自: 北京
社区版块
存档分类
最新评论

nutch 2.0 search accumulo solr

 
阅读更多
1. http://www.covert.io/post/18414889381/accumulo-nutch-and-gora
2. http://blog.packetloop.com/2012/03/packetpig-open-source-big-data-security.html
3. http://blog.csdn.net/lengyue365/article/details/7874003
4. http://nlp.solutions.asia/?p=232
5. http://wiki.apache.org/nutch/NewScoringIndexingExample

环境说明:
accumulo 1.5
nutch 2.0 nutchgora git-branch
hadoop 0.20.2
zookeeper 3.4.3
gora
solr 3.6.1

webpage数据说明:
<gora-orm>
    
    <table name="webpage">
        <family name="p" /> <!-- This can also have params like compression, bloom filters -->
        <family name="f" />
        <family name="s" />
        <family name="il" />
        <family name="ol" />
        <family name="h" />
        <family name="mtdt" />
        <family name="mk" />
        <config key="table.file.compress.blocksize" value="32K"/>
    </table>
    <class table="webpage" keyClass="java.lang.String" name="org.apache.nutch.storage.WebPage">
        
        <!-- fetch fields                                       -->
        <field name="baseUrl" family="f" qualifier="bas"/>
        <field name="status" family="f" qualifier="st"/>
        <field name="prevFetchTime" family="f" qualifier="pts"/>
        <field name="fetchTime" family="f" qualifier="ts"/>
        <field name="fetchInterval" family="f" qualifier="fi"/>
        <field name="retriesSinceFetch" family="f" qualifier="rsf"/>
        <field name="reprUrl" family="f" qualifier="rpr"/>
        <field name="content" family="f" qualifier="cnt"/>
        <field name="contentType" family="f" qualifier="typ"/>
        <field name="protocolStatus" family="f" qualifier="prot"/>
        <field name="modifiedTime" family="f" qualifier="mod"/>
        
        <!-- parse fields                                       -->
        <field name="title" family="p" qualifier="t"/>
        <field name="text" family="p" qualifier="c"/>
        <field name="parseStatus" family="p" qualifier="st"/>
        <field name="signature" family="p" qualifier="sig"/>
        <field name="prevSignature" family="p" qualifier="psig"/>
        
        <!-- score fields                                       -->
        <field name="score" family="s" qualifier="s"/>
        <field name="headers" family="h"/>
        <field name="inlinks" family="il"/>
        <field name="outlinks" family="ol"/>
        <field name="metadata" family="mtdt"/>
        <field name="markers" family="mk"/>
    </class>
    
    <table name="host">
        <family name="mtdt" />
        <family name="il" />
        <family name="ol" />
    </table>
    
    <class table="host" keyClass="java.lang.String" name="org.apache.nutch.storage.Host">
        <field name="metadata" family="mtdt"/>
        <field name="inlinks" family="il"/>
        <field name="outlinks" family="ol"/>
    </class>
    
</gora-orm>

登录accumulo,查看webpage表结构:
./accumulo shell -u xxx -p xxx
root@inst> table webpage

1.在用户目录下创建名为urls的文件,加入一行:http://www.360buy.com/
执行./nutch inject ~/urls
root@inst webpage> scan -r com.360buy.www:http/
com.360buy.www:http/ f:fi []    \x00'\x8D\x00
com.360buy.www:http/ f:ts []    \x00\x00\x01:':\xA6\xE2
com.360buy.www:http/ mk:_injmrk_ []    y
com.360buy.www:http/ mtdt:_csh_ []    ?\x80\x00\x00
com.360buy.www:http/ s:s []    ?\x80\x00\x00

2. ./nutch generate
root@inst webpage> scan -r com.360buy.www:http/
com.360buy.www:http/ f:fi []    \x00'\x8D\x00
com.360buy.www:http/ f:ts []    \x00\x00\x01:':\xA6\xE2
com.360buy.www:http/ mk:_gnmrk_ []    1349277947-925721513
com.360buy.www:http/ mk:_injmrk_ []    y
com.360buy.www:http/ mtdt:_csh_ []    ?\x80\x00\x00
com.360buy.www:http/ s:s []    ?\x80\x00\x00

3. ./nutch fetch 1349277947-925721513
root@inst webpage> scan -r com.360buy.www:http/ -f 50
com.360buy.www:http/ f:bas []    http://www.360buy.com/
com.360buy.www:http/ f:cnt []    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Trans
com.360buy.www:http/ f:fi []    \x00'\x8D\x00
com.360buy.www:http/ f:prot []    \x02\x00\x00
com.360buy.www:http/ f:pts []    \x00\x00\x01:':\xA6\xE2
com.360buy.www:http/ f:st []    \x00\x00\x00\x02
com.360buy.www:http/ f:ts []    \x00\x00\x01:'<\x8C}
com.360buy.www:http/ f:typ []    application/xhtml+xml
com.360buy.www:http/ h:Cache-Control []    max-age=120
com.360buy.www:http/ h:Connection []    close
com.360buy.www:http/ h:Content-Encoding []    gzip
com.360buy.www:http/ h:Content-Location []    http://www.360buy.com/index.htm
com.360buy.www:http/ h:Content-Type []    text/html; charset=gb2312
com.360buy.www:http/ h:Date []    Wed, 03 Oct 2012 15:27:02 GMT
com.360buy.www:http/ h:Last-Modified []    Wed, 03 Oct 2012 15:25:57 GMT
com.360buy.www:http/ h:Server []    JDWS
com.360buy.www:http/ h:Vary []    Accept-Encoding
com.360buy.www:http/ h:X-Cache []    MISS from TJ-HY-CNC-CDN-55.360buy.com
com.360buy.www:http/ h:_ip []    125.39.96.182
com.360buy.www:http/ mk:_ftcmrk_ []    1349277947-925721513
com.360buy.www:http/ mk:_gnmrk_ []    1349277947-925721513
com.360buy.www:http/ mk:_injmrk_ []    y
com.360buy.www:http/ mtdt:_csh_ []    ?\x80\x00\x00
com.360buy.www:http/ s:s []    ?\x80\x00\x00


4. ./nutch parse 1349277947-925721513
root@inst webpage> scan -r com.360buy.www:http/ -f 50
com.360buy.www:http/ f:bas []    http://www.360buy.com/
com.360buy.www:http/ f:cnt []    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Trans
com.360buy.www:http/ f:fi []    \x00'\x8D\x00
com.360buy.www:http/ f:prot []    \x02\x00\x00
com.360buy.www:http/ f:pts []    \x00\x00\x01:':\xA6\xE2
com.360buy.www:http/ f:st []    \x00\x00\x00\x02
com.360buy.www:http/ f:ts []    \x00\x00\x01:'<\x8C}
com.360buy.www:http/ f:typ []    application/xhtml+xml
com.360buy.www:http/ h:Cache-Control []    max-age=120
com.360buy.www:http/ h:Connection []    close
com.360buy.www:http/ h:Content-Encoding []    gzip
com.360buy.www:http/ h:Content-Location []    http://www.360buy.com/index.htm
com.360buy.www:http/ h:Content-Type []    text/html; charset=gb2312
com.360buy.www:http/ h:Date []    Wed, 03 Oct 2012 15:27:02 GMT
com.360buy.www:http/ h:Last-Modified []    Wed, 03 Oct 2012 15:25:57 GMT
com.360buy.www:http/ h:Server []    JDWS
com.360buy.www:http/ h:Vary []    Accept-Encoding
com.360buy.www:http/ h:X-Cache []    MISS from TJ-HY-CNC-CDN-55.360buy.com
com.360buy.www:http/ h:_ip []    125.39.96.182
com.360buy.www:http/ mk:__prsmrk__ []    1349277947-925721513
com.360buy.www:http/ mk:_ftcmrk_ []    1349277947-925721513
com.360buy.www:http/ mk:_gnmrk_ []    1349277947-925721513
com.360buy.www:http/ mk:_injmrk_ []    y
com.360buy.www:http/ mtdt:_csh_ []    ?\x80\x00\x00
com.360buy.www:http/ ol:http://app.360buy.com/ []    \xE6\x89\x8B\xE6\x9C\xBA\xE4\xBA\xAC\xE4\xB8\x9C
---------------------------------------------- hit any key to continue or 'q' to quit ----------------------------------------------
com.360buy.www:http/ ol:http://book.360buy.com/ []    \xE5\x9B\xBE\xE4\xB9\xA6
com.360buy.www:http/ ol:http://caipiao.360buy.com/ []    \xE5\xBD\xA9\xE7\xA5\xA8
com.360buy.www:http/ ol:http://chat.360buy.com/jdchat/custom.action []    \xE5\x9C\xA8\xE7\xBA\xBF\xE5\xAE\xA2\xE6\x9C\x8D
com.360buy.www:http/ ol:http://chongzhi.360buy.com/ []    \xE5\x85\x85\xE5\x80\xBC
com.360buy.www:http/ ol:http://diy.360buy.com/ []    \xE8\xA3\x85\xE6\x9C\xBA\xE5\xA4\xA7\xE5\xB8\x88
com.360buy.www:http/ ol:http://e.360buy.com/index.html []    \xE7\x94\xB5\xE5\xAD\x90\xE4\xB9\xA6\xE5\x88\x8A
com.360buy.www:http/ ol:http://game.360buy.com/ []    \xE6\xB8\xB8\xE6\x88\x8F
com.360buy.www:http/ ol:http://help.360buy.com/ []    \xE5\xAE\xA2\xE6\x88\xB7\xE6\x9C\x8D\xE5\x8A\xA1
com.360buy.www:http/ ol:http://help.360buy.com/help/question-61.html []    \xE5\xB8\xB8\xE8\xA7\x81\xE9\x97\xAE\xE9\xA2\x98
com.360buy.www:http/ ol:http://home.360buy.com/ []    \xE6\x88\x91\xE7\x9A\x84\xE4\xBA\xAC\xE4\xB8\x9C
com.360buy.www:http/ ol:http://jd2008.360buy.com/JdHome/OrderList.aspx []    \xE6\x88\x91\xE7\x9A\x84\xE8\xAE\xA2\xE5\x8D\x95
com.360buy.www:http/ ol:http://jd2008.360buy.com/purchase/ShoppingCart.asp []    \xE5\x8E\xBB\xE8\xB4\xAD\xE7\x89\xA9\xE8\xBD\xA6\xE7\xBB\x93\xE7\xAE\x97
com.360buy.www:http/ ol:http://market.360buy.com/giftcard/ []    \xE7\xA4\xBC\xE5\x93\x81\xE5\x8D\xA1
com.360buy.www:http/ ol:http://market.360buy.com/giftcard/company/default. []    \xE4\xBC\x81\xE4\xB8\x9A\xE5\xAE\xA2\xE6\x88\xB7
com.360buy.www:http/ ol:http://mvd.360buy.com/ []    \xE9\x9F\xB3\xE5\x83\x8F
com.360buy.www:http/ ol:http://myjd.360buy.com/opinion/list.action []    \xE6\x8A\x95\xE8\xAF\x89\xE4\xB8\xAD\xE5\xBF\x83
com.360buy.www:http/ ol:http://myjd.360buy.com/repair/orderlist.action []    \xE5\x94\xAE\xE5\x90\x8E\xE6\x9C\x8D\xE5\x8A\xA1
com.360buy.www:http/ ol:http://read.360buy.com/ []    \xE5\x9C\xA8\xE7\xBA\xBF\xE8\xAF\xBB\xE4\xB9\xA6
com.360buy.www:http/ ol:http://sale.360buy.com/p10997.html []    \xE5\x8A\x9E\xE5\x85\xAC\xE7\x9B\xB4\xE9\x80\x9A\xE8\xBD\xA6
com.360buy.www:http/ ol:http://trip.360buy.com/ []    \xE6\x97\x85\xE8\xA1\x8C
com.360buy.www:http/ ol:http://www.360buy.com/ []    \xE9\xA6\x96\xE9\xA1\xB5
com.360buy.www:http/ ol:http://www.360buy.com/auto.html []    \xE6\xB1\xBD\xE8\xBD\xA6\xE7\x94\xA8\xE5\x93\x81
com.360buy.www:http/ ol:http://www.360buy.com/baby.html []    \xE6\xAF\x8D\xE5\xA9\xB4
com.360buy.www:http/ ol:http://www.360buy.com/bag.html []    \xE7\xA4\xBC\xE5\x93\x81\xE7\xAE\xB1\xE5\x8C\x85
---------------------------------------------- hit any key to continue or 'q' to quit ----------------------------------------------
com.360buy.www:http/ ol:http://www.360buy.com/beauty.html []    \xE4\xB8\xAA\xE6\x8A\xA4\xE5\x8C\x96\xE5\xA6\x86
com.360buy.www:http/ ol:http://www.360buy.com/clothing.html []    \xE6\x9C\x8D\xE9\xA5\xB0\xE9\x9E\x8B\xE5\xB8\xBD
com.360buy.www:http/ ol:http://www.360buy.com/computer.html []    \xE7\x94\xB5\xE8\x84\x91\xE3\x80\x81\xE5\x8A\x9E\xE5\x85\xAC
com.360buy.www:http/ ol:http://www.360buy.com/contact/service.html []    \xE5\xAE\xA2\xE6\x9C\x8D\xE9\x82\xAE\xE7\xAE\xB1
com.360buy.www:http/ ol:http://www.360buy.com/digital.html []    \xE6\x89\x8B\xE6\x9C\xBA\xE6\x95\xB0\xE7\xA0\x81
com.360buy.www:http/ ol:http://www.360buy.com/electronic.html []    \xE5\xAE\xB6\xE7\x94\xA8\xE7\x94\xB5\xE5\x99\xA8
com.360buy.www:http/ ol:http://www.360buy.com/food.html []    \xE9\xA3\x9F\xE5\x93\x81\xE9\xA5\xAE\xE6\x96\x99\xE3\x80\x81\xE4\xBF\x9D\xE5\x81\xA5\xE9\xA3\x9F\xE5\x93\x81
com.360buy.www:http/ ol:http://www.360buy.com/home.html []    \xE5\xAE\xB6\xE5\xB1\x85\xE5\xAE\xB6\xE8\xA3\x85
com.360buy.www:http/ ol:http://www.360buy.com/jewellery.html []    \xE7\x8F\xA0\xE5\xAE\x9D
com.360buy.www:http/ ol:http://www.360buy.com/kitchenware.html []    \xE5\x8E\xA8\xE5\x85\xB7
com.360buy.www:http/ ol:http://www.360buy.com/sports.html []    \xE8\xBF\x90\xE5\x8A\xA8\xE5\x81\xA5\xE5\xBA\xB7
com.360buy.www:http/ ol:http://www.360buy.com/toys.html []    \xE7\x8E\xA9\xE5\x85\xB7\xE4\xB9\x90\xE5\x99\xA8
com.360buy.www:http/ ol:http://www.360buy.com/watch.html []    \xE9\x92\x9F\xE8\xA1\xA8
com.360buy.www:http/ ol:http://www.360top.com/ []    360TOP \xE5\xA5\xA2\xE4\xBE\x88\xE5\x93\x81
com.360buy.www:http/ ol:http://www.ehaoyao.com/ []    \xE4\xBA\xAC\xE4\xB8\x9C \xE5\xA5\xBD\xE8\x8D\xAF\xE5\xB8\x88
com.360buy.www:http/ ol:http://www.minitiao.com/ []    \xE8\xBF\xB7\xE4\xBD\xA0\xE6\x8C\x91
com.360buy.www:http/ ol:http://xiaoyuan.360buy.com/ []    \xE6\xA0\xA1\xE5\x9B\xAD\xE9\xA2\x91\xE9\x81\x93
com.360buy.www:http/ p:c []    \xE4\xBA\xAC\xE4\xB8\x9C\xE7\xBD\x91\xE4\xB8\x8A\xE5\x95\x86\xE5\x9F\x8E-\xE7\xBB\xBC\xE5\x90\x88\xE7\xBD\x91\xE8\xB4\xAD\xE9\xA6\x96\xE9\x80\x89\xEF\xBC\x8C\xE6\xAD\xA3\xE5\x93\x81\xE8\xA1\x8C\xE8
com.360buy.www:http/ p:sig []    HNC\xF3\x87\xEF\x8E\xD1mB\xE4\xE3\xA2\xA3\x1D\xEA
com.360buy.www:http/ p:st []    \x02\x00\x00
com.360buy.www:http/ p:t []    \xE4\xBA\xAC\xE4\xB8\x9C\xE7\xBD\x91\xE4\xB8\x8A\xE5\x95\x86\xE5\x9F\x8E-\xE7\xBB\xBC\xE5\x90\x88\xE7\xBD\x91\xE8\xB4\xAD\xE9\xA6\x96\xE9\x80\x89\xEF\xBC\x8C\xE6\xAD\xA3\xE5\x93\x81\xE8\xA1\x8C\xE8
com.360buy.www:http/ s:s []    ?\x80\x00\x00


5. ./nutch updatedb



6. ./nutch solrindex http://localhost:8983/solr/ 1349277947-925721513
./nutch solrindex http://localhost:8983/solr/ -reindex
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics