官方文档:Documentation

Installation

一些实例

  1. 提取PPBC杜鹃兰所有图片信息(滚动页面)
{"_id":"plant","startUrl":["http://ppbc.iplant.cn/sp/38791"],"selectors":[{"id":"scroll","parentSelectors":["_root"],"type":"SelectorElementScroll","selector":"div.title","multiple":true,"delay":2000,"elementLimit":0},{"id":"name","parentSelectors":["scroll"],"type":"SelectorText","selector":".namew a","multiple":false,"regex":""},{"id":"address","parentSelectors":["scroll"],"type":"SelectorText","selector":"span","multiple":false,"regex":""}]}
  1. 提取CVH中国植物标本馆名录列表(页面部分加载)
{"_id":"cvh_herbaria","startUrl":["https://www.cvh.ac.cn/topics/herbaria.php"],"selectors":[{"id":"page","parentSelectors":["_root","page"],"paginationType":"clickMore","selector":".page-next a","type":"SelectorPagination"},{"id":"table","parentSelectors":["page"],"type":"SelectorTable","multiple":true,"selector":"table.table","tableDataRowSelector":"tbody tr","tableHeaderRowSelector":"thead tr","columns":[{"extract":true,"header":"代码","name":"代  码"},{"extract":true,"header":"标本馆","name":"标本馆"},{"extract":true,"header":"所在地","name":"所在地"},{"extract":true,"header":"联系人","name":"联系人"},{"extract":true,"header":"更新时间","name":"更新时间"}]}]}
  1. 提取CVH成都市全部标本
{"_id":"cvh_scrape","startUrl":["https://www.cvh.ac.cn/spms/list.php?&county=%E6%88%90%E9%83%BD%E5%B8%82&offset=[0-5499:30]"],"selectors":[{"id":"table","parentSelectors":["_root"],"type":"SelectorTable","multiple":true,"selector":"table","tableDataRowSelector":"tr.spms-row","tableHeaderRowSelector":"thead tr","columns":[{"extract":true,"header":"馆藏条码","name":"馆藏条码"},{"extract":true,"header":"中文名/学名","name":"中文名/学名"},{"extract":true,"header":"采集人/采集号","name":"采集人/采集号"},{"extract":true,"header":"采集地","name":"采集地"},{"extract":true,"header":"采集年份","name":"采集年份"}]}]}
  1. 提取中国外来入侵植物名录
{"_id":"invasive_species","startUrl":["http://www.iplant.cn/ias/protlist?page=[1-41]"],"selectors":[{"id":"table","parentSelectors":["_root"],"type":"SelectorTable","multiple":true,"selector":"table","tableDataRowSelector":"tr:nth-of-type(n+2)","tableHeaderRowSelector":"tr:nth-of-type(1)","columns":[{"extract":true,"header":"中文名","name":"中文名"},{"extract":true,"header":"拉丁名","name":"拉丁名"},{"extract":true,"header":"俗名","name":"俗 名"},{"extract":true,"header":"科名","name":"科 名"},{"extract":true,"header":"级别","name":"级 别"},{"extract":true,"header":"原产地","name":"原产地"},{"extract":true,"header":"详细","name":"详 细"}]}]}
  1. 提取PPBC某地区(高黎贡山)全部图片植物信息

若要使用正则表达式并把结果提取成多列,第一级选择器的类型必须选 SelectorElement(Element),不要选 SelectorText(Text);然后在它下面为每一列添加一个 SelectorText 子选择器,并在子选择器中填写正则表达式。

{"_id":"pbcc_adress_plant","startUrl":["http://ppbc.iplant.cn/list21?page=[1-44]&sel=like&didian=%E9%AB%98%E9%BB%8E%E8%B4%A1%E5%B1%B1"],"selectors":[{"id":"data","multiple":true,"parentSelectors":["_root"],"selector":"div.title","type":"SelectorElement"},{"id":"chinese_name","multiple":false,"parentSelectors":["data"],"regex":"[一-龥]+","selector":".namew a","type":"SelectorText"},{"id":"science_name","multiple":false,"parentSelectors":["data"],"regex":"[a-z].*[a-z]","selector":".namew a","type":"SelectorText"},{"id":"adress","multiple":false,"parentSelectors":["data"],"regex":"","selector":"> span","type":"SelectorText"}]}