Skip to content

Latest commit

 

History

History
432 lines (306 loc) · 18.6 KB

File metadata and controls

432 lines (306 loc) · 18.6 KB

Filtering

The pdf files presented here are not accessible in the dataset for copyright reasons.

Preparation

# 00 Preparation ###############################################################

# 0.1 Prepare folder ===========================================================

# Folder check: print the current working directory
getwd()

# Set folder: setwd() requires a directory argument and stops with an error
# when called without one, so the call is kept as a template. Uncomment it and
# fill in the project folder (the relative paths ./data and ./export used in
# the following steps are resolved from it).
# setwd("path/to/project")

# Clean up workspace (also removes objects whose name starts with a dot)
rm(list = ls(all.names = TRUE))

# 0.2 Load packages ============================================================
# Install pacman only when it is missing instead of reinstalling it at each run
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman) #Easier way of loading packages
pacman::p_load(dplyr, tm, tidyr, tibble, stringr, readr) # Specify required packages and download it if needed

# 0.3 Session info =============================================================

sessionInfo()
R version 4.5.1 (2025-06-13 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 26100)

Matrix products: default
  LAPACK version 3.12.1

locale:
[1] LC_COLLATE=French_France.utf8  LC_CTYPE=French_France.utf8    LC_MONETARY=French_France.utf8 LC_NUMERIC=C                  
[5] LC_TIME=French_France.utf8    

time zone: Europe/Paris
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices datasets  utils     methods   base     

other attached packages:
[1] readr_2.1.5   stringr_1.5.1 tibble_3.3.0  tidyr_1.3.1   tm_0.7-16     NLP_0.3-2     dplyr_1.1.4  

loaded via a namespace (and not attached):
 [1] vctrs_0.6.5      knitr_1.50       slam_0.1-55      cli_3.6.5        xfun_0.52        rlang_1.1.6      stringi_1.8.7    purrr_1.1.0      renv_1.0.11     
[10] generics_0.1.4   glue_1.8.0       hms_1.1.3        evaluate_1.0.4   tzdb_0.5.0       lifecycle_1.0.4  compiler_4.5.1   pacman_0.5.1     Rcpp_1.1.0      
[19] pkgconfig_2.0.3  R6_2.6.1         tidyselect_1.2.1 pillar_1.11.0    parallel_4.5.1   magrittr_2.0.3   tools_4.5.1      xml2_1.3.8    

Convert PDF to .TXT

You can download the xpdf app at the following link. Version xpdf.4.05 was used.

# 01 Convert PDF ###############################################################
# 01.1 Select the path and the files ===========================================
path  <- "./data/pdf_full" #Select the location of PDFs files (files not included)

# Vector of the PDF file names found in that folder. The pattern is a regular
# expression anchored on the ".pdf" extension, so a file whose name merely
# contains "pdf" is not handed to the converter.
files <- list.files(path = path, pattern = "\\.pdf$")

xpdf <- "C:/Program Files/xpdf-tools-win-4.05/bin64/pdftotext.exe"  #Path for XPDF tool accessible at (https://www.xpdfreader.com/download.html)


# 01.2 Transform and export in the same folder as the pdf ======================
# seq_along() gives an empty sequence when no PDF was found, whereas
# 1:length(files) would loop over 1 and 0 and call the converter on a
# non-existent file.
for (i in seq_along(files)) {
  pdf <- file.path(path, files[i])
  # Both the executable and the PDF path are wrapped in double quotes because
  # they can contain spaces; wait = FALSE does not wait for the command to end.
  system(paste0("\"", xpdf, "\" \"", pdf, "\""), wait = FALSE)
}

rm(list=ls())

Prepare the text files

# 02 Prepare the txt files #####################################################
# 02.1 Import the data under Corpus file =======================================
# Build a tm corpus from the files of ./data/txt_full/ (files not included).
# NOTE(review): `pattern` is a regular expression, so ".txt" matches any
# character followed by "txt", not only the literal ".txt" extension.
corpus <- VCorpus(DirSource(directory = "./data/txt_full/",
                            pattern = ".txt"))  #Corpus from the package files not included

# 02.2 Clean the elements ======================================================
# Work on a copy so that the untouched `corpus` object stays available
df <- corpus

# Function to remove element and replace it by blank space
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))

# Replace "/", "@" and "|" by a blank space
df <- tm_map(df, toSpace, "/")
df <- tm_map(df, toSpace, "@")
df <- tm_map(df, toSpace, "\\|")

# Convert all upper-case letters to lower case
df <- tm_map(df, content_transformer(tolower)) 

# Remove numbers, then punctuation, then strip the extra white space
df <- tm_map(df, removeNumbers)
df <- tm_map(df, removePunctuation)
df <- tm_map(df, stripWhitespace)

# 02.3 Put into data frame and export ==========================================

# Create a data frame with the text and the ID of the text (file name).
# The content of each document is pasted into a single string, its elements
# being separated by a line break ("\n").
data <- data.frame(id=sapply(df, meta, "id"),
                     text=unlist(lapply(sapply(df, '[', "content"),paste,collapse="\n")),
                     stringsAsFactors=FALSE) 

# Remove the line breaks (carriage returns and new lines) from the text.
# NOTE(review): the characters are deleted, not replaced by a space, so the
# end of one line is glued to the start of the next one unless a space remains
# at the line edge - confirm that this is intended.
data$text <- gsub("[\r\n]", "", data$text)

# Check the first element
data$text[1]
# Export id and text separated by ";" without row names and without header
write.table(data, "./export/pre_process/export_data.txt", sep = ";", row.names = FALSE , col.names = FALSE)

# 02.4 Remove reference part ===================================================

# For separating the different part of the file and removing the "reference part"
remove_references <- function(x) {
  # Splits each text at the word "references" and keeps what precedes it.
  #
  # Args:
  #   x: data frame whose first column is the text identifier and whose second
  #      column is the (already cleaned) text.
  #
  # Returns a data frame with the identifier, then the columns:
  #   text    the input text
  #   check   TRUE when the text contains "introduction"
  #   number  3 when "introduction" is found, else 2 when "abstract" is found,
  #           else 1 when "doi" is found, else 0
  #   intro   matches between "introduction" and "references" (number == 3),
  #           NA otherwise
  #   abs     matches between "abstract" and "references" (number >= 2),
  #           NA otherwise
  #   doi     matches between "doi" and "references" (number >= 1),
  #           NA otherwise
  out <- as.data.frame(x[, 1])
  out$text <- x[, 2]

  # Presence of each marker word in the text
  has_intro <- grepl("introduction", out$text)
  has_abstract <- grepl("abstract", out$text)
  has_doi <- grepl("doi", out$text)

  out$check <- has_intro

  # The most specific marker wins: introduction > abstract > doi > nothing
  out$number <- ifelse(has_intro, 3,
                       ifelse(has_abstract, 2,
                              ifelse(has_doi, 1, 0)))

  # All (lazy) matches of a pattern, one list element per text
  find_matches <- function(pattern) {
    regmatches(out$text, gregexpr(pattern, out$text, perl = TRUE))
  }

  # Separate
  out$intro <- ifelse(out$number == 3,
                      find_matches("(?<=introduction).*?(?=references)"),
                      NA)
  out$abs <- ifelse(out$number >= 2,
                    find_matches("(?<=abstract).*?(?=references)"),
                    NA)
  out$doi <- ifelse(out$number >= 1,
                    find_matches("(?<=doi).*?(?=references)"),
                    NA)
  return(out)
}

# Split every text of `data` at the word "references" (see remove_references)
clean <-  remove_references(data)

# Remove the ".pdf" old information to remove any issues with further analysis.
# fixed = TRUE makes ".pdf" a literal string: as a regular expression the dot
# matches any character, so e.g. "_pdf" inside a file name was removed as well.
clean[,1]<- gsub(".pdf", "", clean[,1], fixed = TRUE)
colnames(clean) <- c("Id", "text", "check", "number", "intro",  "abs", "doi")

# 02.5 Export the elements =====================================================
# Keep the split texts together with the raw corpus for the next step
save(list = c("clean", "corpus"), file = "./export/pre_process/Pre-process.RData")

rm(list=ls())

Combine the metadata and other elements

The information collected on the corpus is stored in the export_data.txt and Pre-process.RData files.

# 03 Combine the different element #############################################
# Restore `clean` and `corpus` saved at the end of step 02
load("./export/pre_process/Pre-process.RData")

# Split the texts by the marker found in step 02.4:
# number 3 = "introduction", 2 = "abstract" (no introduction), 1 = "doi" only
intro <- subset(clean, clean$number == 3)
abs <- subset(clean, clean$number == 2)
doi <- subset(clean, clean$number == 1)

# Text can be seen as full but only filled with "character(0)" string
# (the intro/abs/doi columns hold the regmatches() results of step 02.4; the
# comparisons below presumably rely on an element without any match being
# compared as the string "character(0)" - TODO confirm)

# 03.1 Clean the Introduction selection ========================================
# Texts with a usable introduction-to-references extract
intro$char <- intro$intro == "character(0)"  
intro_final <- subset(intro, char == FALSE)

# No introduction extract: fall back on the abstract extract (number set to 2)
intro <- subset(intro, char == TRUE)
intro$char <- intro$abs == "character(0)"
abs_final <- subset(intro, char == FALSE)
abs_final$number <- 2

# No abstract extract either: fall back on the doi extract (number set to 1)
intro <- subset(intro, char == TRUE)
intro$char <- intro$doi == "character(0)"
doi_final <- subset(intro, char == FALSE)
doi_final$number <- 1

# Nothing could be extracted: number set to 0
no_resume <- subset(intro, char == TRUE)
no_resume$number <- 0

# 03.2 Clean the abstract selection ============================================
# Texts with a usable abstract-to-references extract
abs$char <- abs$abs == "character(0)"
abs_final2 <- subset(abs, char == FALSE)

# No abstract extract: fall back on the doi extract (number set to 1)
abs <- subset(abs, char == TRUE)
abs$char <- abs$doi == "character(0)"
doi_final2 <- subset(abs, char == FALSE)
doi_final2$number <- 1

# Nothing could be extracted: number set to 0
no_resume2 <- subset(abs, char == TRUE)
no_resume2$number <- 0

# 03.3 Clean the selection without abstract and introduction ===================
# Texts with a usable doi-to-references extract
doi$char <- doi$doi == "character(0)"
doi_final3 <- subset(doi, char == FALSE)

# Nothing could be extracted: number set to 0
no_resume3 <- subset(doi, char == TRUE)
no_resume3$number <- 0

# 03.4 Merge all the data ======================================================

# Column names shared by the three selections. The columns are picked by
# position: 1 = Id, 2 = text, 4 = number, then 5 = intro, 6 = abs or 7 = doi
# (column order set with colnames(clean) in step 02.4).
names <- c("Id", "Full_text", "Number","Removed_ref")

intro <- intro_final[,c(1:2,4:5)] #merge the introduction selections 
colnames(intro) <- names

abs <- rbind(abs_final, abs_final2) #merge the abstract selections 
abs <- abs [,c(1:2,4,6)]
colnames(abs) <- names

doi <- rbind(doi_final, doi_final2, doi_final3) #merge the selections without abstract and introduction
doi <- doi [,c(1:2,4,7)]
colnames(doi) <- names

# The not_treated texts have to be analysed again to see whether the words "abstract" or "introduction" are embedded in other words, which can make the grepl search inefficient
# Once checked and modified if necessary (remove the reference part manually), they have to be loaded again later in the steps.
not_treated <- rbind(no_resume, no_resume2 , no_resume3)
not_treated <- not_treated [,c(1:2,4)]

# Texts whose doi column is NA, i.e. texts with number 0 in step 02.4 (none of
# the words "introduction", "abstract" or "doi" was found)
no_doi <- subset(clean, is.na(clean$doi) == TRUE) # Look if a references part has not been split from an article
no_doi <- no_doi [,c(1:2,4)]
not_treated <- rbind(not_treated, no_doi)
colnames(not_treated) <- c ("Id", "Full_text", "Number") 

final_data <- rbind(intro, abs, doi) #Combine all the data in one data frame


# Export the Ids of the texts that still have to be treated manually
write.table(not_treated$Id, "./export/pre_process/not_treated.txt", sep = ";", row.names = FALSE , col.names = FALSE)

# 03.5 Add the articles with embedded or no references part =====================
# Repeat steps 2 to 2.4 for the "not_treated" articles once the references have been removed manually
save(list = c("clean", "corpus", "final_data", "not_treated"), file = "./export/pre_process/Filtered_references.RData")

rm(list=ls())

First filter

The Metadata were collected from the information available from the DOI and imports from the different web libraries.

# 04 First filter ##############################################################
# Restore the objects saved at the end of step 03
load("./export/pre_process/Filtered_references.RData")

# 04.1 Import the metadata (Exported from the RIS file in Zotero in CSV format)
# The metadata file is not included in the data set
Metadata <- read_delim("./data/Metadata.csv",
                       delim = ";",
                       na = "NA")

# An empty attachment name counts as missing; entries without a PDF are dropped
Metadata$File.Attachments <- replace(Metadata$File.Attachments,
                                     Metadata$File.Attachments == "",
                                     NA)
Metadata <- drop_na(Metadata, File.Attachments)

Make sure that the File.Attachments column contains only the file name, with nothing else around it.

# 04.2 Filter only Articles from scientific journals ============================
# Keep the metadata entries whose item type is a journal article
Metadata <- subset(Metadata, Item.Type == "journalArticle")

# Keep the texts whose Id is the attachment of one of these journal articles
final_data <- subset(final_data, Id %in% Metadata$File.Attachments)

# Export the cleaned metadata
write_delim(Metadata,
            "./export/pre_process/Metadata_clean.csv",
            delim = ";")

# 04.3 Function for filtering the data =========================================
filter_func <- function(x, z){
  # Flags the rows of `x` whose text `z` mentions both archaeology and machine
  # learning.
  #
  # Args:
  #   x: data frame to annotate, one row per document.
  #   z: texts to search, one element per row of `x`.
  #
  # Returns `x` with the following columns added (or overwritten):
  #   check         TRUE when the last machine learning keyword is found
  #   number        1 when `check` is TRUE, 0 otherwise
  #   archeo_value  number of archaeology keywords found in the text
  #   ML_value      number of machine learning keywords found in the text
  #   filtered      1 when archeo_value > 0 and ML_value > 0, 0 otherwise
  #
  # The search ignores case: titles and abstracts are passed as they were
  # imported, so "Machine Learning" or "Archaeology" must match as well.

  list_archeo <- c("archaeology", "archeology", "archaeological", "archeological") #List of archaeological words related
  # "ml" is searched as a whole word (\\b = word boundary); as a plain
  # substring it also matches inside words such as "html" or "uniformly"
  list_ML <- c("machine learning", "\\bml\\b", "deep learning", "artificial intelligence") #List of ML words related

  # One logical vector (one value per text) for each keyword
  find_keywords <- function(keywords) {
    lapply(keywords, function(keyword) grepl(keyword, z, ignore.case = TRUE))
  }

  # Number of keywords found in each text
  count_keywords <- function(hits) {
    Reduce(`+`, lapply(hits, as.numeric))
  }

  archeo_hits <- find_keywords(list_archeo)
  ML_hits <- find_keywords(list_ML)

  # `check` and `number` keep the result of the last keyword searched, as the
  # step-by-step version of this function did
  x$check <- ML_hits[[length(ML_hits)]]
  x$number <- as.numeric(x$check)

  #Filter for the archaeological words
  x$archeo_value <- count_keywords(archeo_hits)

  #Filter for the machine learning words
  x$ML_value <- count_keywords(ML_hits)

  x$filtered <- as.numeric(x$archeo_value > 0 & x$ML_value > 0)

  return(x)
}

# 04.4 Apply the filtering to the articles without references part =============
first_filtered_data <- filter_func(final_data, final_data$Removed_ref)

# 04.5 Subset the different results ============================================
# Articles in which no archeo keyword was found
no_archeo <- subset(first_filtered_data, archeo_value == 0)

# Articles in which no ML keyword was found
no_ML <- subset(first_filtered_data, ML_value == 0)

# Articles that miss the archeo keywords, the ML keywords, or both
no_combined <- subset(first_filtered_data, filtered == 0)

# Articles with archeo AND ML keywords
first_filtered_data <- subset(first_filtered_data, filtered == 1)

# 04.6 Export the resuts =======================================================
# Write the first column (Id) of each subset, tab separated, without header
invisible(Map(
  function(ids, file) {
    write.table(ids, file, sep = "\t", row.names = FALSE, col.names = FALSE)
  },
  list(no_archeo[1], no_ML[1], no_combined[1], first_filtered_data[1]),
  c("./export/first_filter/first_filter_no_archeo.csv",
    "./export/first_filter/first_filter_no_ML.csv",
    "./export/first_filter/first_filter_no_combined.csv",
    "./export/first_filter/first_filtered.csv")
))

save(
  list = c("final_data", "filter_func", "first_filtered_data", "no_combined", "Metadata"),
  file = "./export/first_filter/First_filter.RData"
)
rm(list = c(ls()))

Second filter

# 05 Second filter #############################################################
# Restore the objects saved at the end of the first filter
load("./export/first_filter/First_filter.RData")

# 05.1 Filter the metadata =====================================================
# Metadata of the articles kept by the first filter, exported as they are
Metadata_first_filter <- subset(Metadata, File.Attachments %in% first_filtered_data$Id)
write_delim(Metadata_first_filter,
            "./export/first_filter/Metadata_first_filter.csv",
            delim = ";")

# 05.2 Merge metadata and first filtered ones ==================================
second_filtered_data <- merge(first_filtered_data,
                              Metadata,
                              by.x = "Id",
                              by.y = "File.Attachments")

# Only select articles with an abstract: an empty abstract counts as missing
second_filtered_data$Abstract.Note <- replace(second_filtered_data$Abstract.Note,
                                              second_filtered_data$Abstract.Note == "",
                                              NA)
second_filtered_data <- subset(second_filtered_data, !is.na(Abstract.Note))

# 05.3 Filter the abstract =====================================================
# Data to annotate (x) and texts to search (z): here the abstracts
x <- second_filtered_data
z <- x$Abstract.Note

# Keyword search on the abstracts
abstract_filtered_data <- filter_func(x, z)

# Articles whose abstract has no archeo keyword
no_archeo_abstract <- subset(abstract_filtered_data, archeo_value == 0)

# Articles whose abstract has no ML keyword
no_ML_abstract <- subset(abstract_filtered_data, ML_value == 0)

# Articles whose abstract misses the archeo keywords, the ML keywords, or both
no_combined_abstract <- subset(abstract_filtered_data, filtered == 0)

# Articles whose abstract has archeo AND ML keywords
abstract_filtered_data <- subset(abstract_filtered_data, filtered == 1)

# 05.4 Export the results ======================================================
# Write the first column (Id) of each subset, tab separated, without header
invisible(Map(
  function(ids, file) {
    write.table(ids, file, sep = "\t", row.names = FALSE, col.names = FALSE)
  },
  list(no_archeo_abstract[1], no_ML_abstract[1],
       no_combined_abstract[1], abstract_filtered_data[1]),
  c("./export/second_filter/Abstract_filtered_no_archeo.csv",
    "./export/second_filter/Abstract_filtered_no_ML.csv",
    "./export/second_filter/Abstract_filtered_no_combined.csv",
    "./export/second_filter/Abstract_filtered.csv")
))

save(
  list = c("no_archeo_abstract", "filter_func", "no_combined_abstract",
           "no_ML_abstract", "abstract_filtered_data"),
  file = "./export/second_filter/Abstract_filtered.RData"
)

# Drop the intermediate subsets, they are stored in the RData file
rm(list = c("no_archeo_abstract", "no_combined_abstract", "no_ML_abstract"))

# 05.5 Filter the title ========================================================
# Data to annotate (x) and texts to search (z): here the titles
x <- second_filtered_data
z <- x$Title

# Keyword search on the titles
title_filtered_data <- filter_func(x, z)

# Articles whose title has no archeo keyword
no_archeo_title <- subset(title_filtered_data, archeo_value == 0)

# Articles whose title has no ML keyword
no_ML_title <- subset(title_filtered_data, ML_value == 0)

# Articles whose title misses the archeo keywords, the ML keywords, or both
no_combined_title <- subset(title_filtered_data, filtered == 0)

# Articles whose title has archeo AND ML keywords
title_filtered_data <- subset(title_filtered_data, filtered == 1)

# 05.6 Export the results ======================================================
# Write the first column (Id) of each subset, tab separated, without header
invisible(Map(
  function(ids, file) {
    write.table(ids, file, sep = "\t", row.names = FALSE, col.names = FALSE)
  },
  list(no_archeo_title[1], no_ML_title[1],
       no_combined_title[1], title_filtered_data[1]),
  c("./export/second_filter/Title_filtered_no_archeo.csv",
    "./export/second_filter/Title_filtered_no_ML.csv",
    "./export/second_filter/Title_filtered_no_combined.csv",
    "./export/second_filter/Title_filtered.csv")
))
save(
  list = c("no_archeo_title", "no_ML_title", "no_combined_title", "title_filtered_data"),
  file = "./export/second_filter/Title_filtered.RData"
)

# Drop the intermediate subsets and the temporary x and z objects
rm(list = c("no_archeo_title", "no_ML_title", "no_combined_title", "x", "z"))

# 05.7 Combine the title and abstract filters ==================================
# Stack the articles kept by the title filter and by the abstract filter
second_filtered_data <- rbind(title_filtered_data, abstract_filtered_data)
second_filtered_data <- second_filtered_data[c(1:5)]

# Remove duplicates. An article kept by both filters appears twice, and its
# fifth column (`check`, rewritten by each call to filter_func) can differ
# between the two rows, so comparing whole rows would keep both of them.
# The rows are therefore compared on the article Id only and the first
# occurrence is kept with all its columns.
second_filtered_data <- distinct(second_filtered_data, Id, .keep_all = TRUE)

# Export the Ids of the articles kept by the second filter
write.table(second_filtered_data[1], "./export/second_filter/second_filtered.txt", sep = ";", row.names = FALSE , col.names = FALSE)

# 05.8 Export the metadata with the second filter ==============================
Metadata_second_filter <- subset(Metadata, Metadata$File.Attachments %in% second_filtered_data$Id)  #Export the metadata from the second filter
write_delim(Metadata_second_filter, "./export/second_filter/Metadata_second_filter.csv", delim = ";")

save(list = c("second_filtered_data", "Metadata_second_filter"), file = "./export/second_filter/Final_data_automatic_process.RData")