Filtering

The PDF files referenced here are not included in the dataset for copyright reasons.

Preparation

# 00 Preparation ###############################################################

# 0.1 Prepare folder ===========================================================

# Folder check: print the current working directory. Every path used in this
# script ("./data/...", "./export/...") is relative to it.
getwd()

# Set folder
# setwd() has no default for its `dir` argument, so calling it empty stops the
# script with an error. Uncomment the line below and fill in the project root
# if the directory printed above is not the right one.
# setwd("path/to/project")

# Clean up workspace (removes every object, hidden ones included)
rm(list = ls(all.names = TRUE))

# 0.2 Load packages ============================================================
# Install pacman only when it is missing instead of re-installing on every run
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
library(pacman) # Easier way of loading packages
pacman::p_load(dplyr, tm, tidyr, tibble, stringr, readr) # Load the required packages and install them if needed

# 0.3 Session info =============================================================

sessionInfo()

R version 4.5.1 (2025-06-13 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 26100)

Matrix products: default
  LAPACK version 3.12.1

locale:
[1] LC_COLLATE=French_France.utf8  LC_CTYPE=French_France.utf8    LC_MONETARY=French_France.utf8 LC_NUMERIC=C                  
[5] LC_TIME=French_France.utf8    

time zone: Europe/Paris
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices datasets  utils     methods   base     

other attached packages:
[1] readr_2.1.5   stringr_1.5.1 tibble_3.3.0  tidyr_1.3.1   tm_0.7-16     NLP_0.3-2     dplyr_1.1.4  

loaded via a namespace (and not attached):
 [1] vctrs_0.6.5      knitr_1.50       slam_0.1-55      cli_3.6.5        xfun_0.52        rlang_1.1.6      stringi_1.8.7    purrr_1.1.0      renv_1.0.11     
[10] generics_0.1.4   glue_1.8.0       hms_1.1.3        evaluate_1.0.4   tzdb_0.5.0       lifecycle_1.0.4  compiler_4.5.1   pacman_0.5.1     Rcpp_1.1.0      
[19] pkgconfig_2.0.3  R6_2.6.1         tidyselect_1.2.1 pillar_1.11.0    parallel_4.5.1   magrittr_2.0.3   tools_4.5.1      xml2_1.3.8

Convert PDF to .TXT

The Xpdf command-line tools can be downloaded from https://www.xpdfreader.com/download.html. Version 4.05 was used.

# 01 Convert PDF ###############################################################
# 01.1 Select the path and the files ===========================================
path  <- "./data/pdf_full" # Location of the PDF files (files not included)

# Vector of the PDFs found in that folder. `pattern` is a regular expression:
# it is anchored on the ".pdf" extension so that files merely containing "pdf"
# somewhere in their name are not picked up.
files <- list.files(path = path, pattern = "\\.pdf$")

xpdf <- "C:/Program Files/xpdf-tools-win-4.05/bin64/pdftotext.exe"  # Path to the Xpdf pdftotext tool (https://www.xpdfreader.com/download.html)


# 01.2 Transform and export in the same folder as the pdf ======================
# seq_along() gives an empty sequence when no PDF was found; 1:length(files)
# would instead iterate over c(1, 0) and launch pdftotext on a missing file.
# NOTE(review): step 02 reads from "./data/txt_full/", so the generated .txt
# files presumably have to be moved there by hand - confirm.
for (i in seq_along(files)) {
  pdf <- file.path(path, files[i])
  # Both the executable and the PDF path are wrapped in double quotes because
  # they may contain spaces. wait = FALSE: R does not wait for each conversion
  # to finish before launching the next one.
  system(paste0("\"", xpdf, "\" \"", pdf, "\""), wait = FALSE)
}

rm(list = ls())

Prepare the text files

# 02 Prepare the txt files #####################################################
# 02.1 Import the data under Corpus file =======================================
# `pattern` is a regular expression: the dot is escaped and the match anchored
# so that only files whose name ends in ".txt" are imported (the bare ".txt"
# also matches any character followed by "txt" anywhere in a file name).
corpus <- VCorpus(DirSource(directory = "./data/txt_full/",
                            pattern = "\\.txt$"))  # Corpus built from the text files (files not included)

# 02.2 Clean the elements ======================================================
df <- corpus

# Function replacing every match of `pattern` by a blank space
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))

# Replace some characters by a space
df <- tm_map(df, toSpace, "/")
df <- tm_map(df, toSpace, "@")
df <- tm_map(df, toSpace, "\\|")

# Convert upper-case letters to lower case
df <- tm_map(df, content_transformer(tolower))

# Remove numbers and punctuation, then strip whitespace
df <- tm_map(df, removeNumbers)
df <- tm_map(df, removePunctuation)
df <- tm_map(df, stripWhitespace)

# 02.3 Put into data frame and export ==========================================

# Create a data frame with the text and the ID of the text (file name).
# The lines of each document are pasted together with "\n" as separator.
data <- data.frame(id=sapply(df, meta, "id"),
                     text=unlist(lapply(sapply(df, '[', "content"),paste,collapse="\n")),
                     stringsAsFactors=FALSE)

# Remove the line breaks so that each document becomes one single string.
# NOTE(review): the breaks are dropped without inserting a space, so the last
# word of a line may end up glued to the first word of the next one - confirm
# this is intended.
data$text <- gsub("[\r\n]", "", data$text)

# Check the first element
data$text[1]
write.table(data, "./export/pre_process/export_data.txt", sep = ";", row.names = FALSE , col.names = FALSE)

# 02.4 Remove reference part ===================================================

# For separating the different part of the file and removing the "reference part"
# Flag which section marker occurs in each text and extract the text located
# between that marker and the word "references".
#
# x: table whose first column holds the document id and whose second column
#    holds the (lower-cased) text.
#
# Returns a data frame with the columns
#   `x[, 1]` : the ids (renamed by the caller),
#   text     : the input text,
#   check    : TRUE when the text contains "introduction",
#   number   : 3 = "introduction" found, 2 = "abstract" found (no
#              introduction), 1 = only "doi" found, 0 = none of them,
#   intro / abs / doi : text between the marker and "references"; filled for
#              rows with number == 3 / >= 2 / >= 1 respectively, NA otherwise.
#              An element is character(0) when the marker is eligible but no
#              "<marker> ... references" span exists.
remove_references <- function(x) {
  z <- as.data.frame(x[, 1])
  z$text <- x[, 2]

  has_intro <- grepl("introduction", z$text)
  has_abstract <- grepl("abstract", z$text)
  has_doi <- grepl("doi", z$text)

  z$check <- has_intro

  # Highest-priority marker present in each text
  z$number <- ifelse(has_intro, 3, ifelse(has_abstract, 2, ifelse(has_doi, 1, 0)))

  # Text following `marker` up to the word "references", one list element per
  # document.
  # NOTE(review): the lazy `.*?` stops at the FIRST "references" after the
  # marker, which may be a mention in the body rather than the heading of the
  # bibliography - confirm this is acceptable.
  extract_before_references <- function(marker) {
    pattern <- paste0("(?<=", marker, ").*?(?=references)")
    regmatches(z$text, gregexpr(pattern, z$text, perl = TRUE))
  }

  # Separate
  z$intro <- ifelse(z$number == 3, extract_before_references("introduction"), NA)
  z$abs <- ifelse(z$number >= 2, extract_before_references("abstract"), NA)
  z$doi <- ifelse(z$number >= 1, extract_before_references("doi"), NA)

  z
}

clean <- remove_references(data)

# Remove the ".pdf" part of the ids to avoid issues in the further analysis.
# fixed = TRUE matches the literal text ".pdf"; as a regular expression the dot
# matches any character, so e.g. "xpdf" inside a file name was removed as well.
clean[, 1] <- gsub(".pdf", "", clean[, 1], fixed = TRUE)
colnames(clean) <- c("Id", "text", "check", "number", "intro",  "abs", "doi")

# 02.5 Export the elements =====================================================
save(list = c("clean", "corpus"), file = "./export/pre_process/Pre-process.RData")

rm(list = ls())

Combine the metadata and other elements

The information collected on the corpus is stored in the export_data.txt and Pre-process.RData files.

# 03 Combine the different element #############################################
# Reload the objects saved at the end of step 02 (`clean` and `corpus`).
load("./export/pre_process/Pre-process.RData")

# Split the articles by the marker recorded in `number`:
# 3 = "introduction" found, 2 = "abstract" found, 1 = only "doi" found.
intro <- subset(clean, clean$number == 3)
abs <- subset(clean, clean$number == 2)
doi <- subset(clean, clean$number == 1)

# Text can be seen as full but only filled with "character(0)" string
# The comparisons with "character(0)" below flag those rows: the marker was
# found, but no text could be extracted between it and "references".
# NOTE(review): intro/abs/doi are presumably list columns whose elements are
# converted to text by `==`, an empty match becoming "character(0)" - confirm.

# 03.1 Clean the Introduction selection ========================================
# Rows with a usable introduction-based extraction are final.
intro$char <- intro$intro == "character(0)"  
intro_final <- subset(intro, char == FALSE)

# Remaining rows: fall back to the abstract-based extraction (number set to 2).
intro <- subset(intro, char == TRUE)
intro$char <- intro$abs == "character(0)"
abs_final <- subset(intro, char == FALSE)
abs_final$number <- 2

# Still remaining rows: fall back to the doi-based extraction (number set to 1).
intro <- subset(intro, char == TRUE)
intro$char <- intro$doi == "character(0)"
doi_final <- subset(intro, char == FALSE)
doi_final$number <- 1

# Rows for which none of the three extractions returned text (number set to 0).
no_resume <- subset(intro, char == TRUE)
no_resume$number <- 0

# 03.2 Clean the abstract selection ============================================
# Same cascade for the articles whose best marker was "abstract".
abs$char <- abs$abs == "character(0)"
abs_final2 <- subset(abs, char == FALSE)

# Fall back to the doi-based extraction.
abs <- subset(abs, char == TRUE)
abs$char <- abs$doi == "character(0)"
doi_final2 <- subset(abs, char == FALSE)
doi_final2$number <- 1

# No usable extraction at all.
no_resume2 <- subset(abs, char == TRUE)
no_resume2$number <- 0

# 03.3 Clean the selection without abstract and introduction ===================
# Articles whose only marker was "doi".
doi$char <- doi$doi == "character(0)"
doi_final3 <- subset(doi, char == FALSE)

# No usable extraction at all.
no_resume3 <- subset(doi, char == TRUE)
no_resume3$number <- 0

# 03.4 Merge all the data ======================================================

# Common column layout of the three tables stacked into `final_data`
names <- c("Id", "Full_text", "Number", "Removed_ref")

# Introduction-based extractions: id, full text, marker code, extracted text
intro <- intro_final[, c("Id", "text", "number", "intro")]
colnames(intro) <- names

# Abstract-based extractions (fallbacks from the introduction group included)
abs <- rbind(abs_final, abs_final2)[, c("Id", "text", "number", "abs")]
colnames(abs) <- names

# Doi-based extractions (fallbacks from both other groups included)
doi <- rbind(doi_final, doi_final2, doi_final3)[, c("Id", "text", "number", "doi")]
colnames(doi) <- names

# The not_treated articles have to be inspected again: the words "abstract" or
# "introduction" may be embedded in other words, which defeats the grepl()
# search. Once checked and corrected where necessary (reference part removed
# manually), they have to be loaded again later in the process.
not_treated <- rbind(no_resume, no_resume2, no_resume3)[, c("Id", "text", "number")]

# Articles for which no doi-based extraction exists at all (doi is NA)
no_doi <- subset(clean, is.na(clean$doi))[, c("Id", "text", "number")]
not_treated <- rbind(not_treated, no_doi)
colnames(not_treated) <- c("Id", "Full_text", "Number")

# Combine all the treated articles in one data frame
final_data <- rbind(intro, abs, doi)


write.table(not_treated$Id, "./export/pre_process/not_treated.txt", sep = ";", row.names = FALSE, col.names = FALSE)

# 03.5 Add the articles with embedded or no references part =====================
# Repeat steps 02 to 02.4 for the "not_treated" articles once their references
# have been removed manually
save(list = c("clean", "corpus", "final_data", "not_treated"), file = "./export/pre_process/Filtered_references.RData")

rm(list = ls())

First filter

The metadata were collected from the information available through the DOIs and from the exports of the different online libraries.

# 04 First filter ##############################################################
load("./export/pre_process/Filtered_references.RData")

# 04.1 Import the metadata (Exported from the RIS file in Zotero in CSV format) 
# File not included in the data set. Only the literal "NA" is read as missing,
# so empty attachment cells arrive as "" and are turned into NA just below.
Metadata <- read_delim("./data/Metadata.csv", delim = ";", na = "NA")
Metadata$File.Attachments <- replace(
  Metadata$File.Attachments,
  Metadata$File.Attachments == "",
  NA
)

# Keep only the entries that have a PDF attached
Metadata <- drop_na(Metadata, File.Attachments)

Check that the `File.Attachments` column contains only the file name, with nothing else (no folder path or extra text).

# 04.2 Filter only Articles from scientific journals ============================
# Keep the metadata rows typed as journal articles ...
Metadata <- subset(Metadata, Item.Type == "journalArticle")
# ... and the texts whose id matches one of their attachment names
final_data <- subset(final_data, Id %in% Metadata$File.Attachments)

write_delim(Metadata, "./export/pre_process/Metadata_clean.csv", delim = ";")

# 04.3 Function for filtering the data =========================================
# Score each row of `x` on the presence of archaeology and machine-learning
# keywords in the matching element of `z`.
#
# x           : data frame, one row per article.
# z           : texts to search, one element per row of `x`.
# list_archeo : archaeology keywords (regular expressions, case-sensitive).
# list_ML     : machine-learning keywords (regular expressions, case-sensitive).
#
# Returns `x` with these columns added or overwritten:
#   check        : TRUE when the LAST keyword of `list_ML` was found,
#   number       : `check` coded as 1 / 0,
#   archeo_value : number of `list_archeo` keywords found,
#   ML_value     : number of `list_ML` keywords found,
#   filtered     : 1 when at least one keyword of each list was found, else 0.
#
# NOTE(review): keywords are matched as substrings, so the default "ml" also
# matches inside words such as "html" - confirm this is intended, or pass a
# stricter `list_ML` (e.g. "\\bml\\b").
filter_func <- function(x, z,
                        list_archeo = c("archaeology", "archeology",
                                        "archaeological", "archeological"),
                        list_ML = c("machine learning", "ml", "deep learning",
                                    "artificial intelligence")) {
  stopifnot(length(list_archeo) > 0, length(list_ML) > 0)

  # Number of keywords of `keywords` found in each element of `z`
  count_hits <- function(keywords) {
    total <- numeric(length(z))
    for (keyword in keywords) {
      total <- total + grepl(keyword, z)
    }
    total
  }

  # `check` and `number` keep the values the step-by-step version left behind:
  # the result for the last machine-learning keyword.
  x$check <- grepl(list_ML[length(list_ML)], z)
  x$number <- ifelse(x$check, 1, 0)

  x$archeo_value <- count_hits(list_archeo)
  x$ML_value <- count_hits(list_ML)

  x$filtered <- ifelse(x$archeo_value > 0 & x$ML_value > 0, 1, 0)

  x
}

# 04.4 Apply the filtering to the articles without references part =============
# Score every article on the text from which the reference part was removed
first_filtered_data <- filter_func(final_data, final_data$Removed_ref)

# 04.5 Subset the different results ============================================
# Articles in which no archaeology keyword was found
no_archeo <- subset(first_filtered_data, archeo_value == 0)

# Articles in which no machine-learning keyword was found
no_ML <- subset(first_filtered_data, ML_value == 0)

# Articles missing at least one of the two keyword families
no_combined <- subset(first_filtered_data, filtered == 0)

# Articles containing archaeology AND machine-learning keywords
first_filtered_data <- subset(first_filtered_data, filtered == 1)

# 04.6 Export the results ======================================================
# One file per subset, holding only its first column
first_filter_exports <- list(
  "./export/first_filter/first_filter_no_archeo.csv" = no_archeo,
  "./export/first_filter/first_filter_no_ML.csv" = no_ML,
  "./export/first_filter/first_filter_no_combined.csv" = no_combined,
  "./export/first_filter/first_filtered.csv" = first_filtered_data
)
for (export_file in names(first_filter_exports)) {
  write.table(first_filter_exports[[export_file]][1], export_file,
              sep = "\t", row.names = FALSE, col.names = FALSE)
}

save(list = c("final_data", "filter_func", "first_filtered_data", "no_combined", "Metadata"), file = "./export/first_filter/First_filter.RData")
rm(list = ls())

Second filter

# 05 Second filter #############################################################
load("./export/first_filter/First_filter.RData")

# 05.1 Filter the metadata =====================================================
# Metadata of the articles that passed the first filter, exported as is
Metadata_first_filter <- subset(Metadata, File.Attachments %in% first_filtered_data$Id)
write_delim(Metadata_first_filter, "./export/first_filter/Metadata_first_filter.csv", delim = ";")

# 05.2 Merge metadata and first filtered ones ==================================
# Join on the file name: `Id` in the texts, `File.Attachments` in the metadata
second_filtered_data <- merge(first_filtered_data, Metadata, by.x = "Id", by.y = "File.Attachments")

# Only select articles with an abstract: empty abstracts are first turned into
# NA, then every row with a missing abstract is dropped
second_filtered_data$Abstract.Note <- replace(
  second_filtered_data$Abstract.Note,
  second_filtered_data$Abstract.Note == "",
  NA
)
second_filtered_data <- subset(second_filtered_data, !is.na(Abstract.Note))

# 05.3 Filter the abstract =====================================================
x <- second_filtered_data
# Lower-case the abstracts before matching: filter_func() looks for lower-case
# keywords with case-sensitive grepl() calls, so "Machine Learning", "ML" or
# "Archaeology" in the raw metadata would otherwise be missed.
z <- tolower(x$Abstract.Note)

# Function filtering the data for the abstract
abstract_filtered_data <- filter_func(x, z)

# Subset the articles without archeo keywords
no_archeo_abstract <- subset(abstract_filtered_data, abstract_filtered_data$archeo_value == 0)

# Subset the articles without ML keywords
no_ML_abstract <- subset(abstract_filtered_data, abstract_filtered_data$ML_value == 0)

# Subset the articles missing at least one of the two keyword families
no_combined_abstract <- subset(abstract_filtered_data, abstract_filtered_data$filtered == 0)

# Subset the articles with archeo AND ML keywords
abstract_filtered_data <- subset(abstract_filtered_data, abstract_filtered_data$filtered == 1)

# 05.4 Export the results ======================================================
write.table(no_archeo_abstract[1], "./export/second_filter/Abstract_filtered_no_archeo.csv", sep = "\t", row.names = FALSE , col.names = FALSE)
write.table(no_ML_abstract[1], "./export/second_filter/Abstract_filtered_no_ML.csv", sep = "\t", row.names = FALSE , col.names = FALSE)
write.table(no_combined_abstract[1], "./export/second_filter/Abstract_filtered_no_combined.csv", sep = "\t", row.names = FALSE , col.names = FALSE)
write.table(abstract_filtered_data[1], "./export/second_filter/Abstract_filtered.csv", sep = "\t", row.names = FALSE , col.names = FALSE)

save(list = c("no_archeo_abstract", "filter_func",  "no_combined_abstract", "no_ML_abstract", "abstract_filtered_data"), file = "./export/second_filter/Abstract_filtered.RData")

rm(list=c("no_archeo_abstract", "no_combined_abstract", "no_ML_abstract"))

# 05.5 Filter the title ========================================================
x <- second_filtered_data
# Lower-case the titles before matching: filter_func() looks for lower-case
# keywords with case-sensitive grepl() calls, so a title written as
# "Machine Learning in Archaeology" would otherwise never match.
z <- tolower(x$Title)

# Function filtering the data ==================================================
title_filtered_data <- filter_func(x, z)

# Subset the articles without archeo keywords
no_archeo_title <- subset(title_filtered_data, title_filtered_data$archeo_value == 0)

# Subset the articles without ML keywords
no_ML_title <- subset(title_filtered_data, title_filtered_data$ML_value == 0)

# Subset the articles missing at least one of the two keyword families
no_combined_title <- subset(title_filtered_data, title_filtered_data$filtered == 0)

# Subset the articles with archeo AND ML keywords
title_filtered_data <- subset(title_filtered_data, title_filtered_data$filtered == 1)

# 05.6 Export the results ======================================================
write.table(no_archeo_title[1], "./export/second_filter/Title_filtered_no_archeo.csv", sep = "\t", row.names = FALSE , col.names = FALSE)
write.table(no_ML_title[1], "./export/second_filter/Title_filtered_no_ML.csv", sep = "\t", row.names = FALSE , col.names = FALSE)
write.table(no_combined_title[1], "./export/second_filter/Title_filtered_no_combined.csv", sep = "\t", row.names = FALSE , col.names = FALSE)
write.table(title_filtered_data[1], "./export/second_filter/Title_filtered.csv", sep = "\t", row.names = FALSE , col.names = FALSE)
save(list = c("no_archeo_title", "no_ML_title", "no_combined_title", "title_filtered_data"), file = "./export/second_filter/Title_filtered.RData")

rm(list=c("no_archeo_title", "no_ML_title", "no_combined_title","x","z"))

# 05.7 Combine the title and abstract filters ==================================
# Union of the articles retained on their title and on their abstract
second_filtered_data <- rbind(title_filtered_data, abstract_filtered_data)
second_filtered_data <- second_filtered_data[c(1:5)]

# Remove duplicates on the article id. Comparing whole rows is not enough:
# the kept columns include `check`, which filter_func() recomputes separately
# for the title and for the abstract, so an article retained by both filters
# could survive twice. The first occurrence (the title row) is kept.
second_filtered_data <- distinct(second_filtered_data, Id, .keep_all = TRUE)

write.table(second_filtered_data[1], "./export/second_filter/second_filtered.txt", sep = ";", row.names = FALSE , col.names = FALSE)

# 05.8 Export the metadata with the second filter ==============================
# Metadata of the articles that passed the second filter
Metadata_second_filter <- subset(Metadata, Metadata$File.Attachments %in% second_filtered_data$Id)
write_delim(Metadata_second_filter, "./export/second_filter/Metadata_second_filter.csv", delim = ";")

save(list = c("second_filtered_data", "Metadata_second_filter"), file = "./export/second_filter/Final_data_automatic_process.RData")

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Filtering

Preparation

Convert PDF to .TXT

Prepare the text files

Combine the metadata and other elements

First filter

Second filter

FilesExpand file tree

01 - Filtering.md

Latest commit

History

01 - Filtering.md

File metadata and controls

Filtering

Preparation

Convert PDF to .TXT

Prepare the text files

Combine the metadata and other elements

First filter

Second filter