# 爬取植物志海拔 (scrape elevation ranges from the online flora) ----
library(dplyr)
library(rjson)
library(RCurl)
library(stringr)
#' Look up the elevation text of a taxon in the iPlant flora service.
#'
#' Requests `getfrps.ashx` with `sci_name` as the `key` parameter, parses the
#' JSON reply and returns the text found between "海拔" and the next "米" in
#' its `frpsdesc` field.
#'
#' @param sci_name A single scientific name.
#' @return A length-one character vector. `NA_character_` is returned when the
#'   name is missing, the request or parsing fails (a warning is raised), the
#'   reply is empty, it has no `frpsdesc`, or the pattern does not match.
get_alt <- function(sci_name) {
  stopifnot(length(sci_name) == 1L)
  if (is.na(sci_name)) {
    return(NA_character_)
  }

  # Only the query value is percent-encoded; the endpoint itself is ASCII.
  url <- paste0(
    "http://www.iplant.cn/ashx/getfrps.ashx?key=",
    URLencode(enc2utf8(as.character(sci_name)), reserved = TRUE)
  )

  desc <- tryCatch(
    {
      json <- getURL(url)
      if (length(json) != 1L || is.na(json) || json == "") {
        NULL
      } else {
        # Namespaced: jsonlite::fromJSON masks this once jsonlite is attached.
        rjson::fromJSON(json)$frpsdesc
      }
    },
    error = function(e) {
      # One bad request must not abort a long row-by-row run.
      warning(
        "get_alt failed for '", sci_name, "': ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )

  # A real missing value (not the string "NA") keeps the result consistent
  # with what str_extract() returns when nothing matches.
  if (length(desc) == 0L) {
    return(NA_character_)
  }
  str_extract(as.character(desc)[1], "(?<=海拔).+?(?=米)")
}
# Read the species table, look up an elevation text for every scientific name
# (column `学名`) and write the augmented table to a new workbook.
# `roi::import()` and the bare `export()` did not resolve to any attached
# package, so the readxl / writexl functions used elsewhere in this file are
# used instead.
# NOTE(review): the path separators are doubled backslashes; confirm the
# target system accepts them.
data <- readxl::read_excel("F:\\\\YR\\\\青藏高原保护植物\\\\表格数据\\\\青藏高原保护植物总表.xlsx")
result <- data %>%
  rowwise() %>%
  mutate(altitude = get_alt(学名)) %>%
  ungroup() # drop the rowwise grouping before the table leaves the pipeline
writexl::write_xlsx(result, "F:\\\\YR\\\\青藏高原保护植物\\\\表格数据\\\\青藏高原保护植物总表_带海拔.xlsx")
# 爬取CVH青藏高原所有县级行政区标本 (scrape CVH specimen records for every county-level unit of the Qinghai-Tibet Plateau) ----
library(httr)
library(tidyverse)
library(rvest)
library(xml2)
# Request headers sent with every CVH call in this section.
# The cookie used to be fused into the accept-language value behind an
# embedded line break; it is now a header of its own, and the Referer no
# longer carries the surrounding "<" / ">".
# NOTE(review): the cookie contains a hard-coded PHPSESSID; it will expire and
# should not live in source control - consider reading it from an environment
# variable.
headers <- c(
  "accept" = "application/json, text/javascript, */*; q=0.01",
  "accept-encoding" = "gzip, deflate, br",
  "accept-language" = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
  "cookie" = "_ga=GA1.3.2100900702.1676370533; _pk_id.1.2cf1=caf23ac63d83aed8.1676370533.; _pk_id.43.2cf1=62aeda5b2301e5e6.1676370533.; _gid=GA1.3.1121927377.1679227930; PHPSESSID=uq9a3rj8vrrdo7p5ve2dlibisj; _pk_ses.1.2cf1=1; _pk_ses.43.2cf1=1",
  "Referer" = "https://www.cvh.ac.cn/spms/list.php?&stateProvince[]=%E8%A5%BF%E8%97%8F%E8%87%AA%E6%B2%BB%E5%8C%BA!&offset=0",
  "user-agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44",
  "x-requested-with" = "XMLHttpRequest"
)
# Download the CVH specimen list of every county named in data/TP_县域.txt and
# write one workbook per county.
county <- read.table("data/TP_县域.txt", header = TRUE, sep = ",")

# Fetch one result page and return its `rows` element as a tibble.
# Defined once instead of on every loop iteration.
fun <- function(url) {
  GET(url = url, add_headers(.headers = headers)) %>%
    content() %>%
    .[["rows"]] %>%
    data.table::rbindlist(.) %>%
    tibble()
}

for (i in seq_len(nrow(county))) {
  link <- paste0(
    "https://www.cvh.ac.cn/controller/spms/list.php?&county=",
    county$NAME[i]
  )

  # The first reply reports how many records match the county.
  total <- GET(url = link, add_headers(.headers = headers)) %>%
    content() %>%
    .[["total"]]
  total <- suppressWarnings(as.numeric(total))
  if (length(total) != 1L || is.na(total)) {
    warning("No usable `total` for ", county$NAME[i], "; skipped", call. = FALSE)
    next
  }

  page <- ceiling(total / 30)
  # Offsets 0, 30, ..., 30 * (page - 1). The former `30 * 0:page` always
  # requested one page past the end. At least one page is requested so that a
  # county without records still produces a workbook. Integer arithmetic keeps
  # large offsets from being pasted as e.g. "1e+05".
  n_pages <- as.integer(max(page, 1))
  offsets <- 30L * (seq_len(n_pages) - 1L)
  df <- tibble(url = paste0(link, "&offset=", offsets))

  data <- df$url %>% map_dfr(fun)
  writexl::write_xlsx(
    data,
    path = paste0("E:/石老师/2024-07-16爬取青藏高原所有县CHV标本/", county$NAME[i], ".xlsx")
  )
  print(county$NAME[i]) # progress indicator
}
# County-level cities are queried without the trailing "市" suffix, but the
# workbook is still saved under the full name.
county2 <- data.frame(NAME = c("康定市", "马尔康市", "香格里拉市", "昌都市", "玉树市")) %>%
  mutate(NAME2 = str_replace(NAME, "市$", "")) # anchored: strip the suffix only

# Fetch one result page and return its `rows` element as a tibble.
# Defined once instead of on every loop iteration.
fun <- function(url) {
  GET(url = url, add_headers(.headers = headers)) %>%
    content() %>%
    .[["rows"]] %>%
    data.table::rbindlist(.) %>%
    tibble()
}

for (i in seq_len(nrow(county2))) {
  link <- paste0(
    "https://www.cvh.ac.cn/controller/spms/list.php?&county=",
    county2$NAME2[i]
  )

  # The first reply reports how many records match the query.
  total <- GET(url = link, add_headers(.headers = headers)) %>%
    content() %>%
    .[["total"]]
  total <- suppressWarnings(as.numeric(total))
  if (length(total) != 1L || is.na(total)) {
    warning("No usable `total` for ", county2$NAME[i], "; skipped", call. = FALSE)
    next
  }

  page <- ceiling(total / 30)
  # Offsets 0, 30, ..., 30 * (page - 1). The former `30 * 0:page` always
  # requested one page past the end. At least one page is requested so that a
  # query without records still produces a workbook. Integer arithmetic keeps
  # large offsets from being pasted as e.g. "1e+05".
  n_pages <- as.integer(max(page, 1))
  offsets <- 30L * (seq_len(n_pages) - 1L)
  df <- tibble(url = paste0(link, "&offset=", offsets))

  data <- df$url %>% map_dfr(fun)
  writexl::write_xlsx(
    data,
    path = paste0("E:/石老师/2024-07-16爬取青藏高原所有县CHV标本/", county2$NAME[i], ".xlsx")
  )
  print(county2$NAME[i]) # progress indicator
}
library(httr)
library(tidyverse)
library(rvest)
library(xml2)
# Request headers sent with the CVH calls in this section.
# The cookie used to be fused into the accept-language value behind an
# embedded line break; it is now a header of its own, and the referer no
# longer carries the surrounding "<" / ">".
# NOTE(review): the cookie contains a hard-coded PHPSESSID; it will expire and
# should not live in source control - consider reading it from an environment
# variable.
headers <- c(
  "accept" = "application/json, text/javascript, */*; q=0.01",
  "accept-encoding" = "gzip, deflate, br",
  "accept-language" = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
  "cookie" = "_ga=GA1.3.2100900702.1676370533; _pk_id.1.2cf1=caf23ac63d83aed8.1676370533.; _pk_id.43.2cf1=62aeda5b2301e5e6.1676370533.; _gid=GA1.3.1121927377.1679227930; PHPSESSID=uq9a3rj8vrrdo7p5ve2dlibisj; _pk_ses.1.2cf1=1; _pk_ses.43.2cf1=1",
  "referer" = "https://www.cvh.ac.cn/spms/list.php?&hasFlower=true&offset=30",
  "user-agent" = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44",
  "x-requested-with" = "XMLHttpRequest"
)
# Fetch one page of the flowering-specimen list, then request the detail
# record of every specimen on that page.
url1 <- "https://www.cvh.ac.cn/controller/spms/list.php?&hasFlower=true&offset=30&limit=100"
result1 <- GET(url = url1, add_headers(.headers = headers)) %>%
  content() %>%
  .[["rows"]] %>%
  data.table::rbindlist(.) %>%
  tibble()

# Fetch one detail page and return its `rows` element as a tibble.
# as_tibble() replaces the deprecated as.tibble().
fun <- function(url2) {
  GET(url = url2, add_headers(.headers = headers)) %>%
    content() %>%
    .[["rows"]] %>%
    as_tibble()
}

# One detail URL per collectionID found in the list page. The former
# `url = df$url[1]` line read `df` before it was rebuilt here and its result
# was never used, so it has been removed.
df <- tibble(
  url = paste0(
    "https://www.cvh.ac.cn/controller/spms/detail.php?id=",
    result1$collectionID
  )
)
df$url %>% map_dfr(fun)
# 爬取植物志产地生境海拔信息 (scrape locality, habitat and elevation text from the online flora) ----
library(tidyverse)
library(rvest)
#' Fetch the first distribution / locality sentence of a taxon.
#'
#' Reads the `getfrps.ashx` reply for `sciname` as HTML, takes the text of the
#' `body > div` nodes and keeps the first piece matching ".*产.*|.*分布.*".
#'
#' @param sciname Scientific name used as the `key` query parameter.
#' @return A one-row tibble with a single character column `value`. The value
#'   is `NA` when the page cannot be read or nothing matches; the error message
#'   is printed in that case.
getinfo <- function(sciname) {
  tryCatch(
    {
      # Built inside tryCatch so a bad name is reported instead of aborting.
      url <- paste0(
        "https://www.iplant.cn/ashx/getfrps.ashx?key=",
        URLencode(sciname)
      )
      info <- read_html(url) %>%
        html_nodes(css = "body > div") %>%
        html_text() %>%
        str_extract(".*产.*|.*分布.*") %>%
        na.omit() %>%
        .[[1]] # errors when nothing matched; handled below
      tibble(value = as.character(info))
    },
    error = function(e) {
      print(paste("Caught an error:", conditionMessage(e)))
      # Same column name as the success path, so the combined result always
      # has exactly one column regardless of which names failed.
      tibble(value = NA_character_)
    }
  )
}

# Look up every name of the checklist and append the text as `information`.
data <- readxl::read_excel("G:/研究生竞赛及项目/2023-05-12澜沧江省级自然保护区/植物名录/区系数据/澜沧江自然保护区植物名录-最终.xlsx", sheet = "总表")
result <- data$canonical_name %>% map_dfr(getinfo)
# Selected by name: the former `select(2)` relied on failures and successes
# producing differently named columns in a particular order.
result <- result %>% select(information = value)
result2 <- data %>% bind_cols(result)
result2 %>% writexl::write_xlsx("G:/研究生竞赛及项目/2023-05-12澜沧江省级自然保护区/植物名录/区系数据/澜沧江自然保护区植物名录-最终-带生境海拔信息.xlsx")
library(httr)
library(RCurl)
library(jsonlite)
library(rvest)
# Load the plant checklist workbook into `data`.
data <- readxl::read_excel(
  path = "I:/石老师/2024-11-16-爬取分布地/植物界-2024-47474.xlsx"
)
#' Evaluate an expression, retrying when it signals an error.
#'
#' The expression is captured unevaluated and re-run in the caller's
#' environment on every attempt. Re-forcing the same failed argument instead
#' makes R warn "restarting interrupted promise evaluation" on each retry.
#'
#' @param expr Expression to evaluate.
#' @param retries Maximum number of attempts.
#' @param wait Seconds to sleep between two attempts.
#' @return The value of the first attempt that does not signal an error.
#'   Stops with a message starting with "重试失败" when every attempt fails;
#'   the message of the last error is appended.
retry <- function(expr, retries = 3, wait = 1) {
  code <- substitute(expr)
  env <- parent.frame()
  last_error <- NULL

  for (attempt in seq_len(retries)) {
    result <- try(
      {
        value <- eval(code, envir = env)
        # Kept for compatibility: a quoted call passed as `expr` used to be
        # evaluated as well.
        if (is.language(value)) eval(value, envir = env) else value
      },
      silent = TRUE
    )
    if (!inherits(result, "try-error")) {
      return(result)
    }
    last_error <- attr(result, "condition")
    # No point in sleeping once the last attempt has failed.
    if (attempt < retries) {
      Sys.sleep(wait)
    }
  }

  detail <- if (!is.null(last_error)) paste0(": ", conditionMessage(last_error))
  stop("重试失败", detail, call. = FALSE)
}
# For every species in `data`, query the CVH distribution-map service and
# store the entries of its place list that mention 四川 in `data$location`.

# Create the column up front: assigning `data$location[j]` to a column that
# does not exist yet builds a vector of the wrong length.
if (!"location" %in% names(data)) {
  data$location <- NA_character_
}

# The loop index used to be an uninitialised `j` driven by `while`.
for (j in seq_len(nrow(data))) {
  url <- paste0(
    "https://www.iplant.cn/cvh/ashx/spmapplacegroup2024.ashx?callbackparam=jQuery1111037633911942499454_1731761718489&tr=0.5043297786549452&key=",
    data$物种拉丁名[j]
  )
  url <- URLencode(url)
  json <- retry(expr = {getURI(url)}, retries = 8, wait = 1)

  if (nchar(json) > 1) {
    # Unwrap the JSONP reply: drop everything up to the first "(" and the
    # closing ")" (plus an optional trailing ";" or whitespace). Stopping at
    # the first "(" keeps parentheses inside the payload intact.
    clean_json <- sub("^[^(]*\\(", "", json)
    clean_json <- sub("\\)[[:space:];]*$", "", clean_json)
    # Namespaced so the result does not depend on package attach order.
    parsed_data <- jsonlite::fromJSON(clean_json)

    if (length(parsed_data$mplace) != 0) {
      split_list <- strsplit(parsed_data$mplace, ",")[[1]]
      location <- split_list[str_detect(split_list, "四川")]
      print(paste(j, location)) # progress indicator
      if (length(location) != 0) {
        # Several matching entries are kept together instead of silently
        # dropping all but the first.
        data$location[j] <- paste(location, collapse = ",")
      }
    }
  }
}
writexl::write_xlsx(data, "I:/石老师/2024-11-16-爬取分布地/植物界_distribution.xlsx")