It is not that it is reading the same URL; it is that you are selecting the wrong node, which happens to yield repeating info. As I mentioned in your last question, you need to rework your Title function. The Title rewrite below extracts the actual article title based on class name and a single-node match. Note the removal of your sep arg. There are also some other areas of the code that look like they could probably be simplified in terms of logic.
Title function:
Title <- function(parsedDocument) {
  Title <- parsedDocument %>%
    html_node(".article-title-main") %>%
    html_text() %>%
    gsub("\r\n\\s+", "", .) %>%
    trimws(.)
  return(Title)
}
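As a quick sanity check, parsing any one article page and calling the rewrite should return a single clean title string. A minimal sketch, where the DOI suffix is a hypothetical placeholder rather than a specific article:

library(rvest)

# Smoke test for the rewritten Title function;
# "dsaa001" is a hypothetical DOI suffix used only for illustration
doc <- read_html("https://doi.org/10.1093/dnares/dsaa001")
Title(doc)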
R:
library(rvest)
library(XML)
library(stringr)
# Get the number of result pages
getPageNumber <- function(URL) {
  # print(URL)
  parsedDocument <- read_html(URL)
  Sort1 <- html_nodes(parsedDocument, "div")
  Sort2 <- Sort1[which(html_attr(Sort1, "class") == "pagination al-pagination")]
  P <- str_count(html_text(Sort2), pattern = " \\d+\n")
  return(ifelse(length(P) == 0, 0, max(P)))
}
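# Aside: the div filtering above can likely be collapsed into a single
# CSS class selector. A minimal sketch, assuming the "al-pagination"
# class is stable; getPageNumber2 is a hypothetical name, not used below.
getPageNumber2 <- function(URL) {
  parsedDocument <- read_html(URL)
  pagination <- html_node(parsedDocument, "div.al-pagination")
  P <- str_count(html_text(pagination), pattern = " \\d+\n")
  if (length(P) == 0 || is.na(P)) 0 else max(P)
}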
# Get every article URL on a results page based off of its DOI
getAllArticles <- function(URL) {
  print(URL)
  parsedDocument <- read_html(URL)
  Sort1 <- html_nodes(parsedDocument, "div")
  Sort2 <- Sort1[which(html_attr(Sort1, "class") == "al-citation-list")]
  ArticleDOInumber <- trimws(gsub(".*10\\.1093/dnares/", "", html_text(Sort2)))
  URL3 <- "https://doi.org/10.1093/dnares/"
  URL4 <- paste(URL3, ArticleDOInumber, sep = "")
  return(URL4)
}
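# Aside: the same single-selector simplification applies here. A minimal
# sketch, assuming the "al-citation-list" class is stable; getAllArticles2
# is a hypothetical name, not used below.
getAllArticles2 <- function(URL) {
  parsedDocument <- read_html(URL)
  citations <- html_nodes(parsedDocument, "div.al-citation-list")
  DOInumbers <- trimws(gsub(".*10\\.1093/dnares/", "", html_text(citations)))
  paste0("https://doi.org/10.1093/dnares/", DOInumbers)
}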
Title <- function(parsedDocument) {
  Title <- parsedDocument %>%
    html_node(".article-title-main") %>%
    html_text() %>%
    gsub("\r\n\\s+", "", .) %>%
    trimws(.)
  return(Title)
}
# Main function; takes the publication year as its parameter
findURL <- function(year_chosen) {
  if (year_chosen >= 1994) {
    noYearURL <- glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
    pagesURl <- "&fl_SiteID=5275&page="
    URL <- paste(noYearURL, pagesURl, sep = "")
    # URL is working with parameter year_chosen
    Page <- getPageNumber(URL)
    # If exactly 5 pages are reported, keep re-checking until the page
    # count stabilizes (this is one of the areas that could be simplified)
    if (Page == 5) {
      Page2 <- 0
      while (Page < Page2 | Page != Page2) {
        Page <- Page2
        URL3 <- paste(URL, Page - 1, sep = "")
        Page2 <- getPageNumber(URL3)
      }
    }
    R_Data <- data.frame()
    for (i in 1:Page) {
      URL2 <- getAllArticles(paste(URL, i, sep = ""))
      for (j in 1:length(URL2)) {
        parsedDocument <- read_html(URL2[j])
        # print(URL2[j])
        # print(Title(parsedDocument))
        R <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE)
        # print(R)
        R_Data <- rbind(R_Data, R)
      }
    }
    write.csv(R_Data, "Group4.csv", row.names = FALSE)
  } else {
    print("The year you provided is out of range; this journal only contains articles from 1994 to present")
  }
}
findURL(2003)