# quanteda's API changes frequently, and updates regularly break existing pipelines. For full replicability, this script therefore requires the same stable version of quanteda that was used for the initial analysis.
library(readtext)
library(quanteda)
library(tidyverse)
library(stm)
library(tidytext)
library(haven)
library(data.table)
library(ggridges)
library(viridis)
# The UNGD data are available from the Harvard Dataverse at https://doi.org/10.7910/DVN/0TJX8Y
# Directory holding the local copy of the UNGD data; the speech files are read
# from its "TXT/" sub-folder. To run the script on another machine without
# editing it, set the UNGD_DATA_DIR environment variable. When that variable
# is unset or empty, the original hard-coded location is used.
DATA_DIR <- Sys.getenv("UNGD_DATA_DIR", unset = "")
if (!nzchar(DATA_DIR)) {
  DATA_DIR <- "~/Dropbox/Research/UNGDC projects/UN Data/"
}
# Guarantee exactly one trailing slash, so that paths built with
# paste0(DATA_DIR, ...) stay valid when the override is given without one.
DATA_DIR <- paste0(sub("/+$", "", DATA_DIR), "/")

# Read every file matching TXT/*. Each file name is split on "_" and the
# pieces are stored as the document variables Country, Session and Year.
ungd_files <- readtext(
  paste0(DATA_DIR, "TXT/*"),
  docvarsfrom = "filenames",
  dvsep = "_",
  docvarnames = c("Country", "Session", "Year")
)

# Build the corpus from the "text" column of the readtext result.
ungd_corpus <- corpus(ungd_files, text_field = "text")
# Analysis ----
## Wordscore based positions ----

# Tokenise the corpus into words with the punctuation, symbol, number,
# Twitter, URL and hyphen options all switched on.
# NOTE(review): `remove_twitter` and `remove_hyphens` look like argument names
# from an older quanteda release -- run this with the pinned version.
tok <- tokens(
  ungd_corpus,
  what = "word",
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_twitter = TRUE,
  remove_url = TRUE,
  remove_hyphens = TRUE,
  verbose = TRUE
)

# Build the document-feature matrix: lower-case the tokens, drop English
# stopwords and stem. The result is stored under the name `dfm`, which later
# lines rely on; calls to dfm() still reach the function because R skips
# non-function objects when it resolves a call.
dfm <- dfm(
  tok,
  tolower = TRUE,
  remove = stopwords("english"),
  stem = TRUE,
  verbose = TRUE
)
# Remove every feature that matches at least one of these regular expressions:
#   "[\\d-]"       contains a digit or a hyphen
#   "[[:punct:]]"  contains a punctuation character
#   "^.{1,3}$"     is between one and three characters long
dfm.m <- dfm_select(
  dfm,
  c("[\\d-]", "[[:punct:]]", "^.{1,3}$"),
  valuetype = "regex",
  selection = "remove",
  verbose = TRUE
)

# Apply the minimum-frequency thresholds: a term frequency of 5 and a
# document frequency of 3.
# NOTE(review): presumably a feature is dropped when it misses either
# threshold (fewer than 5 occurrences OR fewer than 3 documents) -- confirm
# against the dfm_trim() documentation of the pinned quanteda version.
dfm.trim <- dfm_trim(
  dfm.m,
  min_docfreq = 3,
  min_termfreq = 5,
  verbose = TRUE
)
# Pool the trimmed dfm by year. The grouping vector is read from the same
# object that is being grouped, through the docvars() accessor rather than
# the internal @docvars slot.
dfm.year <- dfm_group(dfm.trim, docvars(dfm.trim, "Year"))

# Keep only the features that occur in at most one year group; their names
# are the variables to be dropped.
dfm.year.trimmed <- dfm_trim(dfm.year, max_docfreq = 1, verbose = TRUE)
df.year.trimmed <- as.data.frame(dfm.year.trimmed)
# Skip the first column name (presumably the document-id column that
# as.data.frame() adds for a dfm -- verify). `[-1]` is used instead of
# `2:length(x)` because the latter counts backwards (2, 1) when only one name
# is present and would then return an NA entry.
variables_drop <- names(df.year.trimmed)[-1]

# Same grouping for the cleaned but untrimmed dfm. The years are taken from
# dfm.m itself instead of from the separate `dfm` object, so the grouping
# vector always belongs to the matrix being grouped.
dfm.year_all <- dfm_group(dfm.m, docvars(dfm.m, "Year"))

# Features of the untrimmed, year-grouped dfm that occur in at most one year.
df.year.trimmed_all <- as.data.frame(dfm_trim(dfm.year_all, max_docfreq = 1))
# Country groups used to build the EU membership indicators, given as
# three-letter country codes. `EU` holds the six countries that are treated
# as members in every year; each `waveN` vector holds the countries of one
# later enlargement round.
EU    <- c("BEL", "FRA", "DEU", "ITA", "LUX", "NLD")
wave1 <- c("DNK", "IRL", "GBR")
wave2 <- c("GRC")
wave3 <- c("ESP", "PRT")
wave4 <- c("AUT", "FIN", "SWE")
wave5 <- c(
  "CZE", "HUN", "POL", "EST", "LVA",
  "LTU", "CYP", "MLT", "SVK", "SVN"
)
wave6 <- c("BGR", "ROU")
wave7 <- c("HRV")
# Accession schedule: the countries in `members` are counted as EU members in
# every year strictly greater than `after`. The country vectors wave1..wave7
# are the ones defined earlier in this script.
eu_accessions <- list(
  list(members = wave1, after = 1972), # first wave
  list(members = wave2, after = 1980), # second wave
  list(members = wave3, after = 1985), # third wave
  list(members = wave4, after = 1994), # fourth wave
  list(members = wave5, after = 2003), # fifth wave
  list(members = wave6, after = 2006), # sixth wave
  list(members = wave7, after = 2012)  # seventh wave
)

# Switch `is.eu` to TRUE for every row whose Country belongs to an accession
# wave and whose Year lies after that wave's cut-off.
#
# df:         data frame with the columns Country, Year and is.eu.
# accessions: list of list(members =, after =) entries; defaults to the
#             schedule above.
# Returns `df` with the updated `is.eu` column.
# NOTE(review): membership is only ever switched on, never off again, so a
# withdrawal from the EU would not be reflected -- confirm that this matches
# the years covered by the data.
flag_eu_accessions <- function(df, accessions = eu_accessions) {
  for (acc in accessions) {
    # which() skips rows where the condition is NA (e.g. a missing Year).
    joined <- which(df$Country %in% acc$members & df$Year > acc$after)
    df$is.eu[joined] <- TRUE
  }
  df
}

# `presus` scores: start from the founding members only ...
presus2 <- mutate(presus, is.eu = Country %in% EU)
# ... keep a copy of that founders-only flag in `waves` (taken before any
# accession is applied; it also fixes the column order that later positional
# renames depend on) ...
presus2$waves <- presus2$is.eu
# ... then add the later accessions.
presus2 <- flag_eu_accessions(presus2)

# `presrus` scores: same membership flag, without the `waves` copy.
presrus2 <- mutate(presrus, is.eu = Country %in% EU)
presrus2 <- flag_eu_accessions(presrus2)
# Add one time-invariant indicator per country group: TRUE when the row's
# Country belongs to the founding members (`originals`) or to the respective
# enlargement wave. On the right-hand side `EU` and `wave1`..`wave7` are the
# global country vectors, since no column of that name exists yet when each
# expression is evaluated.
presus2 <- presus2 %>%
  mutate(
    originals = Country %in% EU,
    wave1 = Country %in% wave1,
    wave2 = Country %in% wave2,
    wave3 = Country %in% wave3,
    wave4 = Country %in% wave4,
    wave5 = Country %in% wave5,
    wave6 = Country %in% wave6,
    wave7 = Country %in% wave7
  )

# Rename columns by position.
# NOTE(review): this presumably labels the wordscore estimate (column 4) and
# turns the time-varying `is.eu` flag of presus2 (column 5) into `EU`. The
# positions depend on the layout of presus / presrus, which is not visible
# here -- verify.
names(presus2)[4:5] <- c("PRESUS_wscore", "EU")
names(presrus2)[4] <- "PRESRUS_wscore"

# Combine both sets of estimates into one table keyed by Country and Year,
# keeping rows that appear in only one of them.
scores <- full_join(presus2, presrus2, by = c("Country", "Year"))
# Cumulative membership flags. Each one starts from the previous flag and
# adds the countries of the next enlargement round(s) for the years after
# their cut-off. `wave1`..`wave4` are the global country vectors here (plain
# indexing, no data mask). which() leaves rows with an NA condition untouched.

# eu9: founding members plus DNK, IRL and GBR after 1972.
scores$eu9 <- scores$originals
scores$eu9[which(scores$Country %in% wave1 & scores$Year > 1972)] <- TRUE

# eu12: eu9 plus GRC after 1980 and ESP, PRT after 1985.
scores$eu12 <- scores$eu9
scores$eu12[which(scores$Country %in% wave2 & scores$Year > 1980)] <- TRUE
scores$eu12[which(scores$Country %in% wave3 & scores$Year > 1985)] <- TRUE

# eu15: eu12 plus AUT, FIN and SWE after 1994.
scores$eu15 <- scores$eu12
scores$eu15[which(scores$Country %in% wave4 & scores$Year > 1994)] <- TRUE
# Calculate the yearly average scores and standard deviations for the groups of EU member states.
# Yearly mean and standard deviation of both wordscore series
# (PRESUS_wscore, PRESRUS_wscore) for each group of EU members.
#
# Every summary uses na.rm = TRUE. Previously only the eu10 and overall EU
# summaries did, so a single missing score turned the whole year into NA for
# eu6 / eu9 / eu12 / eu15 while the other groups silently skipped it. Years
# without missing scores are unaffected.

# Founding six.
eu6 <- scores %>%
  filter(originals == TRUE) %>%
  group_by(Year) %>%
  summarise(
    score_presus_eu6 = mean(PRESUS_wscore, na.rm = TRUE),
    sd_presus_eu6 = sd(PRESUS_wscore, na.rm = TRUE),
    score_presrus_eu6 = mean(PRESRUS_wscore, na.rm = TRUE),
    sd_presrus_eu6 = sd(PRESRUS_wscore, na.rm = TRUE)
  )

# The ten countries in `wave5`, over all years in which they appear.
eu10 <- scores %>%
  filter(wave5 == TRUE) %>%
  group_by(Year) %>%
  summarise(
    score_presus_eu10 = mean(PRESUS_wscore, na.rm = TRUE),
    sd_presus_eu10 = sd(PRESUS_wscore, na.rm = TRUE),
    score_presrus_eu10 = mean(PRESRUS_wscore, na.rm = TRUE),
    sd_presrus_eu10 = sd(PRESRUS_wscore, na.rm = TRUE)
  )

# Cumulative groups. Inside filter() the names eu9 / eu12 / eu15 refer to the
# columns of `scores`, not to the summary objects being assigned.
eu9 <- scores %>%
  filter(eu9 == TRUE) %>%
  group_by(Year) %>%
  summarise(
    score_presus_eu9 = mean(PRESUS_wscore, na.rm = TRUE),
    sd_presus_eu9 = sd(PRESUS_wscore, na.rm = TRUE),
    score_presrus_eu9 = mean(PRESRUS_wscore, na.rm = TRUE),
    sd_presrus_eu9 = sd(PRESRUS_wscore, na.rm = TRUE)
  )

eu12 <- scores %>%
  filter(eu12 == TRUE) %>%
  group_by(Year) %>%
  summarise(
    score_presus_eu12 = mean(PRESUS_wscore, na.rm = TRUE),
    sd_presus_eu12 = sd(PRESUS_wscore, na.rm = TRUE),
    score_presrus_eu12 = mean(PRESRUS_wscore, na.rm = TRUE),
    sd_presrus_eu12 = sd(PRESRUS_wscore, na.rm = TRUE)
  )

eu15 <- scores %>%
  filter(eu15 == TRUE) %>%
  group_by(Year) %>%
  summarise(
    score_presus_eu15 = mean(PRESUS_wscore, na.rm = TRUE),
    sd_presus_eu15 = sd(PRESUS_wscore, na.rm = TRUE),
    score_presrus_eu15 = mean(PRESRUS_wscore, na.rm = TRUE),
    sd_presrus_eu15 = sd(PRESRUS_wscore, na.rm = TRUE)
  )

# All rows flagged in the `EU` column of `scores` (inside filter() the column
# takes precedence over the global `EU` vector), one summary per series.
eu_presus <- scores %>%
  filter(EU == TRUE) %>%
  group_by(Year) %>%
  summarise(
    score_presus = mean(PRESUS_wscore, na.rm = TRUE),
    sd_presus = sd(PRESUS_wscore, na.rm = TRUE)
  )

eu_presrus <- scores %>%
  filter(EU == TRUE) %>%
  group_by(Year) %>%
  summarise(
    score_presrus = mean(PRESRUS_wscore, na.rm = TRUE),
    sd_presrus = sd(PRESRUS_wscore, na.rm = TRUE)
  )

# Attach every summary to the eu6 table by Year, in this column order:
# eu6, eu9, eu12, eu15, eu10, overall presus, overall presrus.
av_scores <- Reduce(
  function(left, right) left_join(left, right, by = "Year"),
  list(eu6, eu9, eu12, eu15, eu10, eu_presus, eu_presrus)
)
# Write the combined results to output files:
# Write each result table to a CSV file in the current working directory.
# The list maps the target file name to the object that is written to it;
# files are written in the order listed.
csv_outputs <- list(
  "average_estimates_v1_2_jeffreys.csv" = av_scores,
  "estimates_v1_2_jeffreys.csv" = scores,
  "presrus_words.csv" = presrus_words,
  "presus_words.csv" = presus_words
)
for (csv_path in names(csv_outputs)) {
  readr::write_csv(csv_outputs[[csv_path]], csv_path)
}
