1 Introduction

This document is the technical appendix of the article entitled “Six Decades of Economic Research at the Bank of England” to be published in the journal History of Political Economy. The article is written by Juan Acosta, Beatrice Cherrier, François Claveau, Clément Fontan, Aurélien Goutsmedt, Francesco Sergi. The part of the project contained in this technical appendix is the responsibility of Aurélien Goutsmedt and François Claveau. We also thank Jérémie Dion, Davide Pulizzotto and Maxime Tremblay for their help.

Please email François Claveau for questions and requests.

First, we present the two databases that we have built in the context of the Rebuilding Macroeconomics project Excavating the Academia/Policy Pipeline: Economic analysis at the Bank of England Pre and Post-Crisis.

Second, using these databases, we produce figures and tables. Some of these results are reproduced in the article while others are supplemental material only available here. As well as sharing the code for these results, we provide information on the methods used.

Here is the description of the R session and packages we have used for generating the following visualisations:

# packages ------------------------------------------

# Packages attached by this document. The order is kept as is because
# packages attached later mask same-named functions of earlier ones.
package_list <- c("knitr", "yaml", "DT", "kableExtra", "viridis",
                  "dplyr", "data.table", "ggplot2", "directlabels", "magrittr",
                  "tidyr", "tibble", "ggrepel", "zoo", "tidytext")
for (p in package_list) {
  # requireNamespace() checks for this exact package. Testing
  # `p %in% installed.packages()` instead compares `p` with every cell of the
  # installed-packages matrix (so a dependency field equal to "knitr" would
  # count as knitr being installed) and rebuilds that matrix on each pass.
  if (!requireNamespace(p, quietly = TRUE)) {
    install.packages(p, dependencies = TRUE)
  }
  library(p, character.only = TRUE)
}


# files and variables -----------
# `paths` is read from a user-specific CSV stored **just above the git project
# path**: the first three lines are skipped and both columns are read as text.
# In this file, paths[1, "path"] prefixes the data files that are loaded and
# paths[2, "path"] prefixes the folders where pictures are saved.
paths <- read.csv(
  "../paths_for_centralbank_project.csv",
  sep = ";",
  skip = 3,
  colClasses = c("character", "character")
)
# First column of the (header-less) category file, as a character vector.
categories_to_scrape <- as.character(
  read.csv("corpus/data/BoE_categories_to_scrape.csv", header = FALSE)[[1]]
)
path_BoE <- "BankofEngland/"
pdf_folder <- "pdf/"
# folder where pictures for the article are saved (created if missing):
article_picture_folder <- paste0(
  paths[2, "path"], path_BoE, "Articles/HOPE_art1/figures/"
)
dir.create(article_picture_folder, recursive = TRUE, showWarnings = FALSE)
root_url_BoE <- "https://www.bankofengland.co.uk"
# Base names (without extension) of the BoE database tables.
f_metadata <- "dt_BoE_metadata"
f_paragraphs <- "dt_BoE_combined_paragraph"
f_people <- "dt_people"
f_doc_author <- "dt_author_doc"
f_refs <- "dt_references"
source("functions/functions_for_corpus.R")

path_WoS <- "WebofScience/"
timespan <- 1970:2018
linewidth <- 1.5
n_none <- "UNKNOWN"
fig_i <- 0 # counter for figure number


CSV_path <- "BoE_Network_Analysis/"

# Loading BoE database
# Every table is stored as "<base name>.rds" in the BoE folder located under
# the first path of `paths`.
dir_BoE_data <- paste0(paths[1, "path"], path_BoE)

dt_BoE <- readRDS(paste0(dir_BoE_data, f_metadata, ".rds"))
dt_people <- readRDS(paste0(dir_BoE_data, f_people, ".rds"))
dt_doc_authors <- readRDS(paste0(dir_BoE_data, f_doc_author, ".rds"))
dt_ref <- readRDS(paste0(dir_BoE_data, f_refs, ".rds"))

# Loading WoS database
# Same layout, in the Web of Science folder under the same root path.
f_idart_w_phys_address <- "Articles_based_on_CB_addresses"
f_idart_w_email_address <- "Articles_based_on_CB_emails"
f_articles <- "CB_articles"
f_authors <- "CB_authors"
f_references <- "CB_references"
dir_WoS_data <- paste0(paths[1, "path"], path_WoS)

dt_w_address <- readRDS(paste0(dir_WoS_data, f_idart_w_phys_address, ".rds"))
dt_w_emails <- readRDS(paste0(dir_WoS_data, f_idart_w_email_address, ".rds"))
dt_articles <- readRDS(paste0(dir_WoS_data, f_articles, ".rds"))
dt_authors <- readRDS(paste0(dir_WoS_data, f_authors, ".rds"))
dt_references <- readRDS(paste0(dir_WoS_data, f_references, ".rds"))

# Loading prosopographic database in the old format----------
path_temp_prosopo <- "old_format_prosopo_data/"
# All four tables sit in the same sub-folder of the BoE data folder.
dir_prosopo_data <- paste0(paths[1, "path"], path_BoE, path_temp_prosopo)

Prosopo_Individuals <- readRDS(
  paste0(dir_prosopo_data, "Prosopographic Database - Individuals.rds")
)
# Keep only the rows whose Gender field is not an empty string.
Prosopo_Individuals <- Prosopo_Individuals[Gender != "", ]

Universities <- readRDS(
  paste0(dir_prosopo_data, "Prosopographic Database - Index Training Places.rds")
)

Prosopo_BoE_Position <- readRDS(
  paste0(dir_prosopo_data, "Prosopographic Database - BoE Position.rds")
)

Prosopo_Non_BoE_Position <- readRDS(
  paste0(dir_prosopo_data, "Prosopographic Database - Non BoE Position.rds")
)

# Helper functions for the prosopographic database and the network analysis.
source("functions/functions_for_prosopographic_database.R")
source("functions/functions_for_network_analysis.R")

# loading topic modeling data-----------

# Folder for the pictures of the full-text analysis (second path of `paths`).
picture_topic_path <- paste0(paths[2, "path"], "full_text_analysis/Pictures/")

# Folder holding the topic-model output (first path of `paths`).
topic_model_path <- paste0(
  paths[1, "path"], path_BoE, "topic_model_BoE_research_docs/"
)

# Semicolon-separated tables, read with fread().
Period_topics <- fread(paste0(topic_model_path, "Period_topics.csv"), sep = ";")
Types_topics <- fread(paste0(topic_model_path, "Types_topics.csv"), sep = ";")
Doc_topics <- fread(paste0(topic_model_path, "Doc_topics.csv"), sep = ";")

# Comma-separated table, read with read.csv() (hence a plain data.frame).
Words_topics <- read.csv(
  paste0(topic_model_path, "top50words_per_topic", ".csv"),
  sep = ","
)



# path to pictures and colors variables-----------
#picture_path <- paste0(paths[2,"path"], "BoE_Network_Analysis/Pictures/")
#dir.create(picture_path)

# 20-colour palette.
mypalette1 <- c(
  "#B2EEF8", "#6DEEFB", "#F998B9", "#6DE78F", "#FBDA28",
  "#FA813E", "#899EE1", "#F7A9A8", "#CD242E", "#63387B",
  "#4799D2", "#13C3F1", "#EF476F", "#3CB95F", "#C7960B",
  "#B64017", "#3E498D", "#793F3F", "#720A0D", "#28093D"
)

# Default 20-colour palette.
mypalette <- c(
  "#1969B3", "#01A5D8", "#DA3E61", "#3CB95F", "#E0AF0C",
  "#E25920", "#6C7FC9", "#DE9493", "#CD242E", "#6F4288",
  "#B2EEF8", "#7FF6FD", "#FDB8D6", "#8BF9A9", "#FEF34A",
  "#FEC57D", "#DAEFFB", "#FEE3E1", "#FBB2A7", "#EFD7F2"
)

# Extended 56-colour palette: the 20 colours of `mypalette` followed by 36
# additional ones ("#27DDD1" appears twice in the extension).
mypalette2 <- c(
  mypalette,
  "#5CAADA", "#37D4F5", "#F5779B", "#62E186",
  "#FBDA28", "#FB8F4A", "#A4B9EA", "#FAC2C0", "#EB6466", "#AD87BC",
  "#0B3074", "#00517C", "#871B2A", "#1A6029", "#7C4B05", "#8A260E",
  "#2E3679", "#793F3F", "#840F14", "#401C56", "#003C65", "#741A09",
  "#602A2A", "#34134A", "#114A1B", "#27DDD1", "#27DD8D", "#4ADD27",
  "#D3DD27", "#DDA427", "#DF2935", "#DD27BC", "#BA27DD", "#3227DD",
  "#2761DD", "#27DDD1"
)

# File-name prefix of the black-and-white (print) version of each figure.
string_for_BW_print_version <- "grey_for_print_"

### Mapping of document categories -----
# Two-column lookup table: raw category identifier (old_category) and the
# label shown on figures (new_category). "\n" inserts a line break in a label.
categ_map <- local({
  labels <- c(
    "research-paper-in-quarterly-bulletin" = "Research in\nQuarterly Bulletin",
    "financial-stability-paper" = "Financial Stability Papers",
    "external-mpc-discussion-paper" = "External MPC\nDiscussion Papers",
    "working-paper" = "Working Papers",
    "WoS-articles" = "External Articles"
  )
  data.table(
    old_category = names(labels),
    new_category = unname(labels)
  )
})

# Show the session description in the HTML output only.
# NOTE(review): `outputFormat` is not defined in this chunk; presumably it is
# set earlier in the document - confirm.
if (outputFormat == "html") {
  sessionInfo()
}
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] docstring_1.0.0        tidytext_0.2.5         zoo_1.8-8             
##  [4] ggrepel_0.8.2          tibble_3.0.3           tidyr_1.1.1           
##  [7] magrittr_1.5           directlabels_2020.6.17 ggplot2_3.3.2         
## [10] data.table_1.13.0      dplyr_1.0.1            viridis_0.5.1         
## [13] viridisLite_0.3.0      kableExtra_1.1.0       DT_0.14               
## [16] yaml_2.2.1             knitr_1.29            
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.1.0  xfun_0.16         purrr_0.3.4       lattice_0.20-41  
##  [5] colorspace_1.4-1  vctrs_0.3.2       generics_0.0.2    htmltools_0.5.0  
##  [9] SnowballC_0.7.0   rlang_0.4.11      pillar_1.4.6      glue_1.4.1       
## [13] withr_2.2.0       lifecycle_1.0.0   stringr_1.4.0     munsell_0.5.0    
## [17] gtable_0.3.0      rvest_1.0.0       htmlwidgets_1.5.1 evaluate_0.14    
## [21] tokenizers_0.2.1  Rcpp_1.0.5        readr_1.3.1       scales_1.1.1     
## [25] webshot_0.5.2     gridExtra_2.3     hms_0.5.3         digest_0.6.25    
## [29] stringi_1.4.6     grid_3.6.3        quadprog_1.5-8    tools_3.6.3      
## [33] janeaustenr_0.1.5 crayon_1.3.4      pkgconfig_2.0.3   ellipsis_0.3.1   
## [37] Matrix_1.2-18     xml2_1.3.2        roxygen2_7.1.1    rmarkdown_2.3    
## [41] httr_1.4.2        rstudioapi_0.11   R6_2.4.1          compiler_3.6.3

For this article, we have also conducted 23 semi-structured interviews, in person or virtually, each lasting between one and two hours. Some interviewees have allowed us to cite their name; in the other cases, we have anonymized them, and, when citing them, we have referred to their position in the context of the citation (staff economist 1, staff economist 2, …, Executive 1, …, former MPC member 1, …).

2 Presentation of the databases

2.1 Documents of the Bank of England

The first database includes information on 4545 documents published by the Bank of England (BoE) that we have scraped from its website, with typical information such as date, authors, title, but also with the plain text, and, when available, an extraction of the list of references.

Among these documents, we have isolated what we consider research documents (n= 1415). The earliest was published on 1972-03-01 and the latest on 2020-02-14. Here is the breakdown of the research documents by category:

  • discussion-paper(pre-1992) and discussion-paper-tech_series(pre-1992): the working papers published between 1979 and 1992 (n= 103);
  • working-paper: the main set of research articles, published since 1993 (n= 856);
  • financial-stability-paper: a specific set of research papers which deals with financial stability, and which is mainly written by the economists of the Financial Stability Directorate (n= 44);
  • external-mpc-discussion-paper: standard research papers, but requested by external members of the Monetary Policy Committee (n= 51);
  • research-paper-in-quarterly-bulletin: articles published in the BoE quarterly bulletin as research document, and that are not duplicates of other items in our database (n= 352);
  • houblon-norman-paper-in-quarterly-bulletin: A few invited research articles (n= 9).

The database also includes speeches published by the BoE (earliest 1972-03-01; latest 2020-03-10):

  • speech: the main set of speeches (n= 1082);
  • speech-in-quarterly-bulletin: Speeches published in the quarterly bulletin. These speeches are not duplicates of other items in our database (n= 229).

1385 of the other 1819 documents are other items in the Quarterly Bulletin.

2.2 Prosopographic database

The second database contains prosopographic information about BoE economists. In other words, it is a collective biography aiming at uncovering shared characteristics across individuals. Data came from a systematic search in published information about the selected individuals. We have included in the database all the individuals who meet at least one of the two following criteria:

  • Having (co-)authored at least 3 Bank of England research documents (publications in external journals are not counted; see above for the categories of internal research documents);
  • Having published at least 1 “discussion paper” between 1979 and 1992. As the Bank counted fewer publications and economists in the 1980s, this was needed to have a larger sample of 1980s BoE economists.

Using these criteria in early 2020 gave us a selection of 368 individuals. The information collected on them includes:

  • Academic training: the degrees obtained by the individuals, and the place where these degrees were obtained.
  • BoE career: the dates of entering and leaving the Bank, the different units (Directorates, Divisions, etc.) the individuals were affiliated with within the BoE, and the periods of affiliation with these units.

3 Figures and Tables

This section uses:

  1. The databases described above;
  2. Information from articles published in external journals by researchers affiliated with central banks (this type of information comes from Web of Science);
  3. Archival material, especially for the organizational structures.

Most figures are shown in color below. The code also produces black and white versions of the figures appearing in the main article (for the print version of the article).

3.1 Organizational structure of the Bank of England

Over the period under investigation, the Bank’s organizational chart evolved numerous times. However, as a rough approximation, it consists of three layers: the Governor and the Deputy Governor(s); Directorates, chaired by Executive Directors, reporting to the Governor and Deputy Governor(s); Divisions, chaired by Heads of Division, reporting to an Executive Director. The two Figures below represent this structure.

Figure A1: Stylized organizational structure of the Bank of England in the 1970s and 1980s. Ellipses indicate omitted unnecessary details (i.e. roles or departments/divisions that are not mentioned in the article). The bidirectional arrow indicates an advising relationship.
Figure A2: Stylized organizational structure of the Bank of England in the 1990s and 2000s.

3.2 PhD and non-PhD recruitment at the BoE

The prosopographic database allows us to compute different statistics about the profile of BoE economists. By taking the date of entry of all the BoE economists in our database, we can compute for each year how many of the economists hired had a PhD. Figure A3 below displays, for 4-year periods, the percentage of individuals with a PhD among the individuals hired.

# One row per BoE economist of the prosopographic database (BoE_Agent == 1):
# V1 tells whether PhD > 0, BoE1_start_date is the date used as entry date.
PhD_per_year <- Prosopo_Individuals[
  BoE_Agent == 1,
  .(V1 = PhD > 0, BoE1_start_date)
]
PhD_per_year <- na.omit(PhD_per_year, cols = "V1")

# Keep only the year of the entry date, as text.
PhD_per_year[, BoE1_start_date := format(as.Date(BoE1_start_date), "%Y")]

# Assign each entry year to its 4-year window ("1976-1979", ..., "2016-2019");
# years outside 1976-2019 (or missing) get no window.
PhD_per_year[, window := {
  entry_year <- as.numeric(BoE1_start_date)
  window_start <- 1976 + 4 * floor((entry_year - 1976) / 4)
  ifelse(entry_year >= 1976 & entry_year <= 2019,
         paste0(window_start, "-", window_start + 3),
         NA_character_)
}]

# Recruits per window (total) and per window and PhD status (N_PhD).
PhD_per_year <- PhD_per_year[, total := .N, keyby = "window"][
  , N_PhD := .N, keyby = c("V1", "window")]
PhD_per_year <- na.omit(PhD_per_year, cols = "window")

# Rounded percentage of each PhD status within its window, one row per
# (status, window) pair, with a readable label for the legend.
PhD_per_year[, P_PhD := round(N_PhD / total * 100)]
PhD_per_year <- unique(PhD_per_year[, c("V1", "window", "P_PhD")])
PhD_per_year[, PhD := ifelse(V1, "PhD", "No PhD")]


# Font sizes shared by the figures: base size and multiplier for titles.
size_x_and_y <- 14
title_boost_factor <- 1.2

# Stacked bars; the fill scale is added at saving time (colour or grey).
p <- ggplot(PhD_per_year, aes(x = window, y = P_PhD, fill = PhD)) +
  geom_bar(width = 0.9, stat = "identity") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = size_x_and_y),
    axis.text.y = element_text(size = size_x_and_y),
    axis.title = element_text(size = size_x_and_y * title_boost_factor),
    legend.text = element_text(size = size_x_and_y),
    legend.title = element_text(size = size_x_and_y * title_boost_factor)
  ) +
  labs(
    # title = "Recruitment of BoE economists with a PhD",
    x = "Years of recruitment",
    y = "Share of recruited economists\nwith or without a PhD"
  ) +
  scale_y_continuous(labels = scales::percent_format(scale = 1))


# NOTE(review): fig_i is not modified in this chunk; confirm that it is
# incremented elsewhere, otherwise the file name keeps the same number.
fig_name <- paste0("FigA", sprintf("%02d", fig_i), "_PhD_recruitment.jpeg")

# Colour version.
ggsave(
  paste0(article_picture_folder, fig_name),
  plot = p + scale_fill_viridis(discrete = TRUE, end = .6, name = "Degree")
)

# Black-and-white version for print.
ggsave(
  paste0(article_picture_folder, string_for_BW_print_version, fig_name),
  p + scale_fill_grey(name = "Degree")
)
Figure A3: Recruitment of economists with a PhD at the Bank of England (1976-2019).

3.3 Geographical location of Master and PhD degrees of BoE economists

We then distinguish the geographical location of the Master and PhD degrees obtained by BoE economists. Figure A4 below shows how the geography of BoE recruitments evolved over time. Even if British degrees remain the norm, the BoE began to open to economists trained overseas after 1992. Interestingly, since 2012, UK universities awarded less than half of the degrees earned by the recruited economists.

# Master and PhD places
# Long table of postgraduate degrees of BoE economists: one row per economist
# and degree column, `value` holding the content of the "place" column.
MasterandPhD <- reshape2::melt(
  Prosopo_Individuals[
    BoE_Agent == 1,
    .(ID, Master_Place, Master_bis_place, Master_ter_place, PhD_Place)
  ],
  id.vars = "ID"
)
# Drop missing and empty places.
MasterandPhD <- MasterandPhD[!is.na(value) & value != ""]

# Places accounting for at least 3% of all degrees, most frequent first.
MasterandPhD1 <- MasterandPhD[, .N, keyby = .(value)][
  N >= (3 / 100) * nrow(MasterandPhD)][
    order(-N)]

# Share of all degrees for each retained place, in percent with one decimal
# (same order as the rows of MasterandPhD1).
MasterandPhD_percentage <- round(
  MasterandPhD1$N / nrow(MasterandPhD) * 100, 1
)

# Annualizing data
# Attach the entry date of each economist to his or her degrees.
MasterandPhD_per_year <- merge(
  MasterandPhD,
  Prosopo_Individuals[BoE_Agent == 1, .(ID, BoE1_start_date)],
  by = "ID"
)
# Attach the location of each institution. Both inputs are reduced to two
# columns by grouping on the first one (grouping column, then value column).
MasterandPhD_per_year <- merge(
  MasterandPhD_per_year[, .(value), by = BoE1_start_date],
  Universities[, .(Institution_Name), by = Location],
  by.x = "value",
  by.y = "Institution_Name"
)

# Keep only the year of the entry date, as text.
MasterandPhD_per_year[
  , BoE1_start_date := format(as.Date(BoE1_start_date), "%Y")
]

# Assign each entry year to its 4-year window ("1980-1983", ..., "2016-2019");
# years outside 1980-2019 (or missing) get no window.
MasterandPhD_per_year[, window := {
  entry_year <- as.numeric(BoE1_start_date)
  window_start <- 1980 + 4 * floor((entry_year - 1980) / 4)
  ifelse(entry_year >= 1980 & entry_year <= 2019,
         paste0(window_start, "-", window_start + 3),
         NA_character_)
}]

# Degrees per window (total) and per window and location (N_Geo).
MasterandPhD_per_year <- MasterandPhD_per_year[
  , total := .N, keyby = "window"][
    , N_Geo := .N, keyby = c("Location", "window")]
MasterandPhD_per_year <- na.omit(
  MasterandPhD_per_year,
  cols = c("window", "BoE1_start_date")
)

# Percentage of each location within its window (not rounded).
MasterandPhD_per_year[, P_Geo := (N_Geo / total) * 100]
MasterandPhD_per_year1 <- unique(
  MasterandPhD_per_year[, c("Location", "window", "P_Geo")]
)

# Fix the stacking order of the locations (levels listed in reverse).
MasterandPhD_per_year1$Location <- factor(
  MasterandPhD_per_year1$Location,
  levels = rev(c("UK", "Europe", "North America", "Other"))
)

# Stacked bars of the share of degrees per location and recruitment window;
# the fill scale is added at saving time (colour or grey).
p <- ggplot(
  MasterandPhD_per_year1,
  aes(x = window, y = P_Geo, fill = Location)
) +
  geom_bar(width = 0.9, stat = "identity") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = size_x_and_y),
    axis.text.y = element_text(size = size_x_and_y),
    axis.title = element_text(size = size_x_and_y * title_boost_factor),
    legend.text = element_text(size = size_x_and_y),
    legend.title = element_text(size = size_x_and_y * title_boost_factor)
  ) +
  labs(
    # title = "Geographical location of Master and PhD degrees of BoE economists",
    x = "Years of recruitment",
    y = "Share of postgraduate degrees" # coming from this geographical area
  ) +
  scale_y_continuous(labels = scales::percent_format(scale = 1))


fig_name <- paste0("FigA", sprintf("%02d", fig_i), "_Degrees_location.jpeg")

# Colour version.
ggsave(
  paste0(article_picture_folder, fig_name),
  p + scale_fill_viridis(discrete = TRUE, begin = 0, end = .7)
)

# Black-and-white version for print.
ggsave(
  paste0(article_picture_folder, string_for_BW_print_version, fig_name),
  p + scale_fill_grey()
)
Figure A4: Geographical location of Master and PhD degrees of economists working at the Bank of England (1980-2019).

3.4 Where BoE economists graduated

Figure A5 below displays the five most common universities where BoE economists got their Master and PhD. The indicated percentages give the ratio of the number of degrees distributed by this university to the total number of degrees in the database. Half of all Master and PhD degrees were obtained in these five institutions.

# Font sizes: base size and multiplier for titles.
size_x_and_y <- 14
title_boost_factor <- 1.2

# One bar per retained institution (absolute number of degrees), labelled
# with its percentage of all degrees. MasterandPhD_percentage is a separate
# vector in the same order as the rows of MasterandPhD1.
p <- ggplot(MasterandPhD1, aes(x = value, y = N, fill = value)) +
  geom_bar(width = 0.9, stat = "identity") +
  geom_text(
    aes(label = paste0(MasterandPhD_percentage, "%")),
    vjust = -0.3,
    size = 6
  ) +
  theme_minimal() +
  ylim(0, 102) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1, size = size_x_and_y),
    axis.text.y = element_text(size = size_x_and_y),
    axis.title = element_text(size = size_x_and_y * title_boost_factor)
  ) +
  # Keep the bars in the row order of MasterandPhD1.
  scale_x_discrete(limits = MasterandPhD1$value) +
  labs(
    # title = "Top 5 Institutions for Master and PhD degrees of BoE Economists",
    x = "",
    y = "Number of postgraduate degrees"
  )



fig_name <- paste0(
  "FigA", sprintf("%02d", fig_i), "_Degrees_institutions.jpeg"
)

# Colour version.
ggsave(
  paste0(article_picture_folder, fig_name),
  p + scale_fill_viridis(discrete = TRUE)
)

# Black-and-white version for print.
ggsave(
  paste0(article_picture_folder, string_for_BW_print_version, fig_name),
  p + scale_fill_grey()
)
Figure A5: Top 5 institutions for Master and PhD degrees of Bank’s Economists (1960-2019). Y-axis reports absolute numbers, while tags report percentages. Total percentage is >100% since both master and PhD locations are reported for some economists (and some can have two masters)

In Figure A6, we study the evolution over time of the universities where BoE economists got their Master and Ph.D. degrees. We have selected the top five universities for each decade between 1970 and 2020 (on the condition that at least two degrees were awarded by the university in the decade). We have identified 8 universities, but we have only kept the five universities that seem the most important over the whole period. For the 5 remaining universities, we have calculated the share of degrees awarded by each university, over a four-year period.

# Degree places and entry date of the BoE economists.
MasterandPhD_bis <- Prosopo_Individuals[
  BoE_Agent == 1,
  .(ID, Master_Place, Master_bis_place, Master_ter_place, PhD_Place,
    BoE1_start_date)
]

# Long table: one row per economist and degree column, without missing,
# "NA" or empty places.
MasterandPhD1 <- reshape2::melt(
  MasterandPhD_bis[
    , c("ID", "Master_Place", "Master_bis_place", "Master_ter_place",
        "PhD_Place")
  ],
  id.vars = "ID"
)
MasterandPhD1 <- MasterandPhD1[!is.na(value) & !(value %in% c("NA", ""))]

# Attach the entry year and drop the rows without one.
MasterandPhD1 <- merge(
  MasterandPhD1,
  MasterandPhD_bis[, c("ID", "BoE1_start_date")],
  by = "ID"
)
MasterandPhD1[, BoE1_start_date := year(as.Date(BoE1_start_date))]
MasterandPhD1 <- MasterandPhD1[!is.na(BoE1_start_date)]

# For each decade starting in 1970, ..., 2010: the (at most) five places with
# the most degrees, among those with more than one degree in the decade.
top_by_decade <- lapply(seq(1970, 2010, 10), function(decade_start) {
  in_decade <- MasterandPhD1[
    between(BoE1_start_date, decade_start, decade_start + 9)
  ]
  counts <- in_decade[, .(top = .N), by = "value"][order(-top)]
  head(counts[top > 1], 5)
})

# Distinct places that made it into at least one decade's list.
top_universities <- unique(rbindlist(top_by_decade)$value)

# Full grid of (university, year) pairs for 1970-2020. Each university is
# repeated once per year and the year sequence once per university, so every
# pair occurs exactly once whatever the number of universities. (Recycling the
# university vector against the repeated years only gives a complete grid when
# the number of universities shares no factor with the 51 years; otherwise
# some pairs are duplicated and others are missing.)
share_years <- 1970:2020
share_universities <- data.table(
  "top_universities" = rep(top_universities, each = length(share_years)),
  "Year" = rep(share_years, times = length(top_universities))
)
share_universities <- share_universities[order(top_universities, Year)]

# Per entry year: number of degrees overall (total_year) and per place
# (nb_year).
MasterandPhD1 <- MasterandPhD1[, total_year := .N, by = "BoE1_start_date"][
  , nb_year := .N, by = c("value", "BoE1_start_date")]
#[, share_year := share_year/total_year]

# Degrees of each selected university in each year; pairs without any degree
# are kept (all.x) and counted as zero.
share_universities <- merge(
  share_universities,
  unique(MasterandPhD1[, c("value", "BoE1_start_date", "nb_year")]),
  by.x = c("top_universities", "Year"),
  by.y = c("value", "BoE1_start_date"),
  all.x = TRUE
)
share_universities[is.na(nb_year), nb_year := 0L]

# Total number of degrees in each year; years without any degree count as
# zero.
share_universities <- merge(
  share_universities,
  unique(MasterandPhD1[, c("BoE1_start_date", "total_year")]),
  by.x = c("Year"),
  by.y = c("BoE1_start_date"),
  all.x = TRUE
)
share_universities[is.na(total_year), total_year := 0L]

# For each 4-year period starting in 1976, ..., 2012: per university, the
# number of degrees in the period (nb_period), the number of degrees of all
# places in the period (total_period) and their ratio (share_period).
period_shares <- lapply(seq(1976, 2012, 4), function(period_start) {
  in_period <- share_universities[
    between(Year, period_start, period_start + 3),
    c("top_universities", "Year", "total_year", "nb_year")
  ]
  in_period[, total_period := sum(total_year), by = "top_universities"]
  in_period[, nb_period := sum(nb_year), by = "top_universities"]
  in_period[, share_period := nb_period / total_period]
  in_period[, Period := paste0(period_start, "-", period_start + 3)]
  in_period
})

# From here on `top_universities` is the table of period shares (it replaces
# the vector of university names of the same name).
top_universities <- rbindlist(period_shares)

# One row per university and period, without the three universities left out
# of the figure.
top_universities <- top_universities %>%
  select(top_universities, share_period, Period) %>%
  unique() %>%
  filter(
    top_universities != "London",
    top_universities != "Queen Mary",
    top_universities != "UCL"
  )

# One smoothed curve per university, labelled directly at its right end (no
# legend). Colour and linetype scales are added at saving time.
p <- ggplot(
  top_universities,
  aes(
    x = Period,
    y = share_period,
    group = top_universities,
    color = top_universities,
    linetype = top_universities
  )
) +
  geom_smooth(show.legend = FALSE, size = 1.5, span = .6, se = FALSE) +
  geom_dl(
    data = top_universities, # [top_universities %in% c("LSE","Cambridge","Birkbeck")]
    aes(label = top_universities),
    method = list(
      "last.bumpup",
      dl.trans(x = x + 0.1) # , cex = .9#, vjust = .4
    )
  ) +
  #  geom_dl(data = top_universities[top_universities %in% c("Oxford","Warwick")],
  #  aes(label = top_universities), method = list("last.bumpup", dl.trans(x=x+0.1),
  #   cex = 1.2, vjust = .2)) +
  # Clipping is switched off and the right margin widened for the labels.
  coord_cartesian(clip = "off") +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = size_x_and_y),
    plot.margin = margin(0.1, 2.5, 0.1, 0.1, "cm"),
    title = element_text(size = size_x_and_y * title_boost_factor),
    plot.subtitle = element_text(size = size_x_and_y)
  ) +
  labs(y = "Share of postgraduate degrees", x = "Years of recruitment") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1))



fig_name <- paste0(
  "FigA", sprintf("%02d", fig_i), "_Top_institutions_through_time.jpeg"
)

# Colour version: solid lines only, universities told apart by colour.
ggsave(
  paste0(article_picture_folder, fig_name),
  p +
    scale_color_viridis(discrete = TRUE, end = .9) +
    scale_linetype_manual(values = rep(1, times = nrow(top_universities)))
)

# Black-and-white version for print: grey shades plus solid/dashed lines
# (five linetype values are supplied).
ggsave(
  paste0(article_picture_folder, string_for_BW_print_version, fig_name),
  p +
    scale_color_grey() +
    scale_linetype_manual(values = c(1, 2, 1, 1, 2))
)


 #sum <-top_universities[, Sum := sum(share_period), by = "Period"]
 #sum <- unique(sum[, c("Period","Sum")])
Figure A6: Master and PhD origins of economists recruited at each period (smoothed)

The following table sums up the number of individuals of our database entering the BoE for each period, and the corresponding number of degrees (individuals can have several masters and a PhD, meaning that one individual can have several degrees).

# Degree places and entry date of the BoE economists.
MasterandPhD_bis <- Prosopo_Individuals[
  BoE_Agent == 1,
  .(ID, Master_Place, Master_bis_place, Master_ter_place, PhD_Place,
    BoE1_start_date)
]

# Long table: one row per economist and degree column, without missing,
# "NA" or empty places, with the entry year attached.
MasterandPhD1 <- reshape2::melt(
  MasterandPhD_bis[
    , c("ID", "Master_Place", "Master_bis_place", "Master_ter_place",
        "PhD_Place")
  ],
  id.vars = "ID"
)
MasterandPhD1 <- MasterandPhD1[!is.na(value) & !(value %in% c("NA", ""))]
MasterandPhD1 <- merge(
  MasterandPhD1,
  MasterandPhD_bis[, c("ID", "BoE1_start_date")],
  by = "ID"
)
MasterandPhD1[, BoE1_start_date := year(as.Date(BoE1_start_date))]

# Distinct (economist, place, entry year) triples, counted per entry year.
econ_per_year <- unique(MasterandPhD1[, c("ID", "value", "BoE1_start_date")])
econ_per_year[, nb_degrees := .N, by = "BoE1_start_date"]

# One row per economist, counted per entry year.
econ_per_year <- unique(
  econ_per_year[, c("ID", "BoE1_start_date", "nb_degrees")]
)
econ_per_year[, nb_economists := .N, by = "BoE1_start_date"]

# Entry year 1889 is excluded.
# NOTE(review): presumably a data-entry error in the database - confirm.
econ_per_year <- econ_per_year[BoE1_start_date != "1889"]

# One row per entry year, in chronological order.
econ_per_year <- unique(
  econ_per_year[
    order(BoE1_start_date),
    c("BoE1_start_date", "nb_degrees", "nb_economists")
  ]
)

# Sum the yearly counts over each 4-year period starting in 1976, ..., 2012.
period_rows <- lapply(seq(1976, 2012, 4), function(period_start) {
  in_period <- econ_per_year[
    between(BoE1_start_date, period_start, period_start + 3),
    c("BoE1_start_date", "nb_degrees", "nb_economists")
  ]
  in_period[, nb_econ := sum(nb_economists)]
  in_period[, nb_degrees := sum(nb_degrees)]
  in_period[, Period := paste0(period_start, "-", period_start + 3)]
  in_period
})
econ_per_period <- rbindlist(period_rows)

# One row per period.
econ_per_period <- unique(
  econ_per_period[, c("Period", "nb_econ", "nb_degrees")]
)

# datatable(econ_per_period, rownames = FALSE,
#           options =  list(scrollY = "400px",
#   scrollCollapse = TRUE,
#   scrollX = "300px",
#   paging = FALSE))
kable(
  econ_per_period,
  booktabs = TRUE
  # , caption = "New arrivals at the BoE in our prosopographic database"
) %>%
  kable_styling(position = "center") # %>% kable_styling(latex_options = "striped")
Period nb_econ nb_degrees
1976-1979 4 5
1980-1983 6 7
1984-1987 5 6
1988-1991 14 15
1992-1995 27 35
1996-1999 36 47
2000-2003 67 99
2004-2007 33 49
2008-2011 52 75
2012-2015 55 91

3.5 Evolution of BoE research documents

The next graph counts the number of research documents published by the BoE over time. It differentiates the working papers (gathering the following documents of our database: working-paper, financial-stability-paper, discussion-paper-tech_series(pre-1992), discussion-paper(pre-1992), external-mpc-discussion-paper), the research documents published in the quarterly bulletin (research-paper-in-quarterly-bulletin and houblon-norman-paper-in-quarterly-bulletin) and the articles published in external journals, listed in Web of Science, by economists affiliated to the BoE. Figure 1 below is included in the main article.

# 1 - Evolution of number of research documents through time

dt_BoE_copy <- dt_BoE # necessary to have the code still functioning 
                      # for following graph 
                      #(in the next line we will need complete date of dt_BoE)
dt_BoE_copy$date <- format(dt_BoE_copy$date, "%Y")
BoE_Research <- dt_BoE_copy[is_research_paper == 1, c("doc_ID","date","category")]
BoE_Research[category == "discussion-paper-tech_series(pre-1992)" | 
               category == "discussion-paper(pre-1992)"| 
               category == "external-mpc-discussion-paper" | 
               category == "financial-stability-paper"]$category <- "working-paper"
BoE_Research[category == "houblon-norman-paper-in-quarterly-bulletin"
             ]$category <- "research-paper-in-quarterly-bulletin"

# Web of Science articles with at least one author flagged as BoE, either
# through the address field (is_BoE_phys) or the e-mail field (is_BoE_email).
# One row per article/author after the inner join on dt_authors.
BoE_articles <- dt_articles[, .(ID_Art, Year, Titre, Revue, ItemID_Ref)] %>%
  merge(dt_w_address[, .(ID_Art, is_BoE_phys)], by = "ID_Art", all.x = TRUE) %>%
  merge(dt_w_emails[, .(ID_Art, is_BoE_email)], by = "ID_Art", all.x = TRUE) %>%
  merge(dt_authors[, .(ID_Art, Nom)], by = "ID_Art")

# Keep BoE-affiliated rows published between 1979 and 2019, and rename the
# identifier and year columns so they line up with BoE_Research.
BoE_articles <- BoE_articles[
  (is_BoE_phys == TRUE | is_BoE_email == TRUE) &
    Year >= "1979" & Year <= "2019",
  .(doc_ID = ID_Art, date = Year)]
BoE_articles[, category := "WoS-articles"]

# One row per article (the author join duplicated articles).
BoE_articles <- unique(BoE_articles)

# Stack the in-house research documents and the WoS articles, then attach to
# every row the number of documents sharing its year and category.
BoE_total <- rbind(BoE_Research, BoE_articles)
BoE_total[, total := .N, by = .(date, category)]

# Preparing the plot data:
# - apply the clean map of categories (adds the columns of categ_map, matched
#   on its old_category column);
BoE_total <- merge(BoE_total, categ_map,
                   by.x = "category", by.y = "old_category")
# - store the year as a number in a dedicated column.
BoE_total[, Year := as.numeric(date)]

# Figure 1: smoothed yearly counts for the three document types.

# Text sizes used by this plot (and read again by later plots in the file).
size_x_and_y <- 14
title_boost_factor <- 1.2

p <- BoE_total %>%
  filter(
    between(Year, 1979, 2018),
    category %in% c("research-paper-in-quarterly-bulletin",
                    "working-paper",
                    "WoS-articles")
  ) %>%
  ggplot(aes(x = Year, y = total,
             group = new_category, color = new_category)) +
  geom_smooth(show.legend = FALSE, size = 1.5, span = .4, se = FALSE) +
  # Direct labels at the end of each curve replace the legend.
  geom_dl(aes(label = new_category),
          method = list("last.points", dl.trans(x = x + 0.1), cex = 1.4)) +
  # clip = "off" plus the wide right margin below let the labels overflow
  # the panel.
  coord_cartesian(clip = "off") +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.text = element_text(size = size_x_and_y),
    plot.margin = margin(0.1, 4.5, 0.1, 0.1, "cm"),
    title = element_text(size = size_x_and_y * title_boost_factor),
    plot.subtitle = element_text(size = size_x_and_y)
  ) +
  # Title and subtitle are intentionally left out of the saved figure.
  labs(y = "Number of documents", x = "")

fig_name <- "Fig01_BoE_Publications.jpeg"

# Colour version.
ggsave(filename = paste0(article_picture_folder, fig_name),
       plot = p + scale_color_viridis(discrete = TRUE, end = .8))

# Grey-scale version for print.
ggsave(filename = paste0(article_picture_folder,
                         string_for_BW_print_version, fig_name),
       plot = p + scale_color_grey(end = .6))
Figure 1: Evolution of the research output of the Bank of England. External articles are articles published in journals listed in Web of Science, whether they are standard academic journals like the American Economic Review or the Economic Journal, or research journals published by central banks like the Federal Reserve Bank of St. Louis Review. Curves are smoothed with local polynomial regression.

3.6 Percentage of research documents published by each directorate

Thanks to the Prosopographic database, we can have information about the authors’ affiliations within the Bank of England. For instance, we are able to know if a research document has been written by someone in the Financial Stability Directorate or in the Monetary Analysis Directorate.

For Figure A7 below, we measure the share of authorship for each directorate. We first identify the affiliations of the authors of each research document, when we have the information. We can then associate each document with one or several directorates. If a document has been written by two authors, one in Financial Stability and the other in Monetary Analysis, it gives 0.5 to Financial Stability and 0.5 to Monetary Analysis. We then sum these document-level weights over all the research documents.

The graph displays the share of each BoE directorate in research documents for successive three-year windows (1995-1997, 1998-2000, and so on). We use three-year windows in order to avoid erratic variations.

These data should be taken as approximations because 1) some authors of BoE research documents are not in the database (see the inclusion criteria above) and 2) information is sometimes missing for individuals in the database.

Due to the 2013 BoE restructuring, two divisions of the Financial Stability Directorate became independent directorates. We have merged their publications with those of the Financial Stability Directorate to have more continuity in our data. The corresponding curve is labelled “FS and Similar”.

# calling the function to merge the prosopo database with BoE corpus metadata
BoE_Documents_Prosopo <- merging_BoE_prosopo(Prosopo_Individuals,
                                             Prosopo_BoE_Position, dt_doc_authors)

# creating windows to look at the evolution over time of directorates productivity

# Reduce the publication date to its year (stored as a character string).
BoE_Documents_Prosopo$date <- as.Date(BoE_Documents_Prosopo$date)
BoE_Documents_Prosopo$date <- format(BoE_Documents_Prosopo$date, "%Y")

# Assign every document to a non-overlapping three-year window labelled
# "start-end": 1995-1997, 1998-2000, ..., 2016-2018.
# between() is inclusive on both sides, so the upper bound has to be i + 2:
# with i + 3 the last window ("2016-2018") would also absorb 2019 documents.
# The year is compared as an integer so the test does not rely on string
# ordering. Documents outside 1995-2018 keep window = NA.
for (i in seq(1995, 2018, 3)) {
  BoE_Documents_Prosopo[between(as.integer(date), i, i + 2),
                        window := paste0(i, "-", i + 2)]
}

# for divisions
# counting the number of documents published for each period
# (one row per distinct doc_ID/window pair; n_docs is the number of such rows
#  in each window, including the NA window)
Documents_per_window <- unique(BoE_Documents_Prosopo[,c("doc_ID","window")])
Documents_per_window <- Documents_per_window[, n_docs:= .(.N), by = "window"]

# keeping only the most recurrent directorates
# (positions of category "4 - Directorate" that account for at least 1% of
#  all rows of that category)
Directorates_names <-  BoE_Documents_Prosopo[Category == "4 - Directorate", .N, 
                                             keyby = .(Position)]
Directorates_names <- Directorates_names[
  N >= (1/100)*BoE_Documents_Prosopo[Category == "4 - Directorate", .N]]
Directorates_names <- Directorates_names[,Position]

# Rows belonging to the retained directorates; rows without a window are
# dropped.
Directorates_per_window <- BoE_Documents_Prosopo[Position %in% Directorates_names]
Directorates_per_window <- na.omit(Directorates_per_window, cols="window")
# Total authorship weight of each directorate in each window
# (the same value is repeated on every row of the group).
Directorates_per_window <- Directorates_per_window[
  , sum_weights := sum(weight_authors), by = c("window","Position")]
# Bring in the number of documents of the window ...
Directorates_per_window <- merge(
  Directorates_per_window, unique(Documents_per_window[
    ,c("window","n_docs")]), by = "window")
# ... and express the directorate's weight as a percentage of those documents,
# rounded to one decimal. unique() only removes rows that are identical on
# every column.
Directorates_per_window <- unique(
  Directorates_per_window[, p_directorate := round((sum_weights/n_docs)*100,1)])

# Build the "Financial Stability and Similar" aggregate.
#
# Start from ONE row per (directorate, window). The de-duplication has to
# happen BEFORE the three directorates are relabelled: if it is done after,
# two of them that happen to have the same rounded share in a window become
# identical rows, collapse into one, and the summed share is understated.
Alt_Directorates_per_window <- unique(
  Directorates_per_window[, c("Position", "p_directorate", "window")])
Alt_Directorates_per_window[
  Position %in% c("Financial Stability Directorate",
                  "Markets Infrastructure Directorate",
                  "Prudential Policy Directorate"),
  Position := "Financial Stability and Similar"]
# Sum the shares of the relabelled directorates within each window, then keep
# a single row per (Position, window).
Alt_Directorates_per_window[
  , p_directorate := sum(p_directorate), by = c("window", "Position")]
Alt_Directorates_per_window <- unique(Alt_Directorates_per_window)


# Shorter label for the stand-alone Financial Stability Directorate.
Directorates_per_window[
  Position == "Financial Stability Directorate", Position := "Financial Stability"]

# Stack the individual directorates with the aggregated series.
plot_data <- unique(rbindlist(
  list(Directorates_per_window[, list(Position, p_directorate, window)],
       Alt_Directorates_per_window[Position == "Financial Stability and Similar"]),
  fill = TRUE)
)
# Keep the positions present in the aggregated table (this excludes the two
# small directorates, which were relabelled there) plus "Financial Stability".
plot_data <- plot_data[Position %in% c(unique(Alt_Directorates_per_window$Position),
                                       "Financial Stability")]
name_FS_and_sim <- "FS and Similar"
plot_data[Position == "Financial Stability and Similar", Position := name_FS_and_sim]
# Only the aggregate is plotted, so the stand-alone series is dropped.
plot_data <- plot_data[!Position %in% ("Financial Stability")]


# Share of each directorate per window, one line per directorate.
p <- ggplot(
  data = plot_data,
  mapping = aes(x = window, y = p_directorate,
                colour = Position, group = Position)
) +
  geom_line(size = 1.5) +
  scale_color_viridis(discrete = TRUE, end = .8) +
  theme_minimal() +
  # No y limits are imposed; clipping is disabled so that labels may overflow
  # into the right margin.
  coord_cartesian(ylim = NULL, clip = "off") +
  # Every series except "Monetary Analysis" is labelled at its last point ...
  geom_dl(
    data = plot_data[Position != "Monetary Analysis"],
    mapping = aes(label = Position),
    method = list("last.bumpup", dl.trans(x = x + 0.2), cex = 1.3)
  ) +
  # ... while "Monetary Analysis" is labelled at the 2007-2009 window.
  geom_text(
    data = plot_data[Position == "Monetary Analysis" & window == "2007-2009"],
    mapping = aes(label = Position),
    hjust = -0.1, vjust = -.1, size = 6
  ) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = size_x_and_y),
    plot.margin = margin(0.1, 4.5, 0.1, 0.1, "cm"),
    title = element_text(size = size_x_and_y * title_boost_factor),
    plot.subtitle = element_text(size = size_x_and_y)
  ) +
  # Title and subtitle are intentionally left out of the saved figure.
  labs(y = "Share of affiliations to the\ndifferent BoE directorates", x = "")

fig_name <- paste0("FigA", sprintf("%02d", fig_i),
                   "_affil_to_BoE_directorates.jpeg")

ggsave(filename = paste0(article_picture_folder, fig_name), plot = p)
Figure A7: Percentage of research documents published by each directorate.

3.7 Citations to in-house research in speeches: by directorates

We have detected the explicit references to internal research papers in the speeches. Here, we present the extent to which the two major research directorates (Monetary Analysis and Financial Stability) tend to be cited by BoE spokespersons. The year (x-axis) refers to the year of publication of the research document, not the speech. Note also that this representation takes into account only whether or not a research document is cited in at least one speech, not the number of times it is cited. Finally, in Figure A8, we use again the method used above to weigh the authorship of directorates for each document.

# References whose citing document is a speech (category contains "speech")
# and whose cited document is an in-house research paper.
refs_speeches <- dt_ref[
  citing_doc_ID %in% dt_BoE[grepl("speech", category), doc_ID] &
    cited_doc_ID %in% dt_BoE[is_research_paper == TRUE, doc_ID]
]

# Flag, in the metadata, every document that appears in this speech citation
# network, either as citing or as cited document.
dt_BoE[, in_network := doc_ID %in% c(refs_speeches$citing_doc_ID,
                                     refs_speeches$cited_doc_ID)]
# Publication year in its own column, to make things simpler below.
dt_BoE[, year := year(date)]

# One row per (research paper, author); research papers without any author
# row are kept (all.y = TRUE) with NA author fields.
researchers_cit <- data.table(
  merge(dt_doc_authors, dt_BoE[is_research_paper == TRUE],
        by = "doc_ID", all.y = TRUE, all.x = FALSE)
)
# Drop rows with a missing author name, except for papers whose `authors`
# field is itself missing in the metadata.
researchers_cit <- researchers_cit[
  !is.na(name) |
    doc_ID %in% dt_BoE[is_research_paper == TRUE & is.na(authors), doc_ID]
]
# Remaining missing names get the placeholder value n_none.
researchers_cit[is.na(name), name := n_none]

# Loading affiliations in the new version of the prosopographic data.
# The file location is built from the user-specific `paths` table and path_BoE.
dt_indiv_affiliations <- readRDS(
  file = paste0(paths[1,"path"],path_BoE,"dt_indiv_affiliations.rds")) %>% data.table()




# Bank of England affiliations of the authors present in researchers_cit.
boe_position <- dt_indiv_affiliations[
  Institution_Name == "Bank of England" & 
    author_ID %in% unique(na.omit(researchers_cit$author_ID))]

# Getting directorate info
# Each (document, author) row is paired with every directorate-level spell of
# that author, hence allow.cartesian ...
directorate_cit <- merge(researchers_cit[,list(doc_ID,author_ID,date)], 
                       boe_position[Unit_level == "4 - Directorate",
                                    list(author_ID,Unit_name,
                                         Start_date = as.Date(Start_date),
                                         End_date = as.Date(End_date))],
                       by= "author_ID", allow.cartesian = T)
#directorate_cit[is.na(Start_date)]
# ... and only the spells covering the document's date are kept.
directorate_cit <- directorate_cit[between(date,Start_date,End_date) ] 

# Bringing this information back into the full table
# (left join: authors without a matching directorate get Unit_name = NA)
directorate_cit <- merge(
  researchers_cit, 
  directorate_cit[,list(doc_ID,author_ID,Unit_name)], 
  by = c("doc_ID","author_ID"), all.x=TRUE)


# Preparing the table
directorate_cit[is.na(Unit_name), Unit_name := "Unknown"]
# Number of rows per document, used below as the denominator of the
# directorate shares.
# NOTE(review): this counts rows after the merge above, so an author matched
# to several directorate spells at the document's date would count more than
# once -- confirm this cannot happen in the data.
directorate_cit[, nb_authors := .N, by = doc_ID]
# adding an 'Extended financial stability' directorate, 
# which brings back under its wing smaller divisions that have split recently:
k = 3
smaller_dir_linked_to_FS <- c("Markets Infrastructure Directorate",
                              "Prudential Policy Directorate")
# Independent copy of the rows of the three related directorates.
extended_directorate <- copy(directorate_cit[
  Unit_name %in% c("Financial Stability Directorate", smaller_dir_linked_to_FS)])
# Year of the earliest document attributed to one of the two smaller
# directorates.
min_year_for_ext_dir <- extended_directorate[
  Unit_name %in% smaller_dir_linked_to_FS, date %>% min() %>% year()]
extended_directorate <- extended_directorate[
  year(date) >= floor(min_year_for_ext_dir-k/2)] # constraining to period 
                                                #  when division occurred
name_extended_dir <- "Financial Stability and Similar"
extended_directorate[,Unit_name := name_extended_dir]
# The relabelled rows are appended, so these documents now appear both under
# their own directorate and under the extended one.
directorate_cit <-  rbindlist(list(directorate_cit,extended_directorate))

# Computing, for each document, the share of its rows attributed to each
# directorate (rows of that directorate / nb_authors), while carrying along
# the document's date, year and in_network flag.
#directorate_cit$Unit_name %>% unique()
directorate_cit <- directorate_cit[, list(prop_directorate = .N/unique(nb_authors),
                       date = unique(date), year = unique(year),
                       in_network = unique(in_network)), by = .(doc_ID,Unit_name)]


# Proportion of weighted authorship that is cited in speeches, per year and
# directorate:
#   weighted_prop_cited = directorate shares of documents present in the
#                         speech citation network / all directorate shares
#   weighted_count      = sum of the directorate shares (weighted number of
#                         documents)
plot_data <- directorate_cit[
  Unit_name %in% c("Financial Stability Directorate", "Monetary Analysis",
                   name_extended_dir),
  list(
    weighted_prop_cited =
      sum(ifelse(in_network, prop_directorate, 0)) / sum(prop_directorate),
    weighted_count = sum(prop_directorate)
  ),
  by = .(year, Unit_name)
]
plot_data[Unit_name == "Financial Stability Directorate",
          Unit_name := "Financial Stability"]

# Computing the 3-year centred moving averages for smoothing.
# Sorting by directorate and year first makes the rolling window run over
# consecutive years. `fill = NA` pads both ends with NA; it replaces the
# deprecated `na.pad = TRUE` and gives the same values.
setkey(plot_data, Unit_name, year)
plot_data[, moving_average_prop_cited :=
            rollmean(weighted_prop_cited, k = 3, fill = NA), by = Unit_name]

# Attach the two-line display labels. Only the two main directorates get one,
# so "Financial Stability and Similar" keeps Directorate = NA.
plot_data <- merge(
  plot_data,
  data.table(Unit_name = c("Financial Stability",
                           "Monetary Analysis"),
             Directorate = c("Financial Stability\nDirectorate",
                             "Monetary Analysis\nDirectorate")),
  by = "Unit_name", all.x = TRUE
)
# Fix the order of the directorates in the legend.
plot_data$Unit_name <- factor(plot_data$Unit_name,
                              levels = c("Monetary Analysis",
                                         "Financial Stability",
                                         "Financial Stability and Similar"))

# Smoothed yearly proportion of research cited in speeches, for the
# directorates that have a display label (rows with Directorate = NA are
# excluded), over 2000-2018.
p <- ggplot(
  data = plot_data[between(year, 2000, 2018) & !is.na(Directorate)],
  mapping = aes(x = year, y = weighted_prop_cited, color = Unit_name)
) +
  geom_smooth(size = 1.5, se = FALSE, span = .4) +
  # Title and subtitle are intentionally left out of the saved figure.
  labs(
    y = "Proportion of research documents",
    x = "Year of the research",
    color = "Directorate"
  ) +
  coord_cartesian(clip = "off") +
  theme_minimal() +
  # The legend is kept here (no direct labels on the curves).
  theme(
    axis.text = element_text(size = size_x_and_y),
    title = element_text(size = size_x_and_y * title_boost_factor),
    plot.subtitle = element_text(size = size_x_and_y),
    legend.text = element_text(size = size_x_and_y),
    legend.title = element_text(size = size_x_and_y * title_boost_factor)
  )

fig_name <- paste0("FigA", sprintf("%02d", fig_i), "_Speech_citations.jpeg")

# Colour version.
ggsave(filename = paste0(article_picture_folder, fig_name),
       plot = p + scale_color_viridis(discrete = TRUE, end = .6))

# Grey-scale version for print.
ggsave(filename = paste0(article_picture_folder,
                         string_for_BW_print_version, fig_name),
       plot = p + scale_color_grey(end = .6))
Figure A8: Proportion of in-house research being cited at least once in speeches. Curves are smoothed using local polynomial regression.

3.8 Citation analysis using Web of Science data

In Web of Science, we identified articles published by economists affiliated to various Central Banks (CB). To detect these documents in Web of Science, we determined through a lengthy iterative procedure a list of strings that match either the address or the email of an author from a central bank. This procedure gave us 19,811 documents from 1970 to 2018.

Figure A9 shows a distribution through time of these publications for the BoE, the US Federal Reserve and all the other central banks.

# Tag every address row and every e-mail row with the central bank it belongs
# to: "Fed", "Bank of England" or "Other".

# Finding the Fed (the regex were defined after serious iterations)
s <- "FED-RES|FRB|RESERVE-BANK-NEW-YORK|BOARD-GOVERNORS|FEDERAL-RESERVE|FEDERAL-RES-BOARD|FED-BANK"
dt_w_address[grepl(s, Institution), Bank_phys := "Fed"]
# The BoE flag is applied after the Fed pattern, so it wins if both match.
dt_w_address[is_BoE_phys == TRUE, Bank_phys := "Bank of England"]
dt_w_address[is.na(Bank_phys), Bank_phys := "Other"]

# E-mail domain: the part following the first "@" (NA when there is no "@").
dt_w_emails[!is.na(Courriel),
            end_e := unlist(strsplit(Courriel[1], "@"))[2], by = Courriel]
# Fed domains. The dots are escaped so that they only match a literal ".",
# not any character.
dt_w_emails[grepl("frb\\.gov|frbchi\\.org|frb\\.org|frb\\.fed", end_e),
            Bank_email := "Fed"]
dt_w_emails[is_BoE_email == TRUE, Bank_email := "Bank of England"]
dt_w_emails[is.na(Bank_email), Bank_email := "Other"]


# Article metadata joined (full outer joins) with the address-based and the
# e-mail-based bank flags.
dt_boe_or_not <- merge(dt_articles[, list(ID_Art, Year, Citations, Code_Document)],
                       dt_w_address[, list(ID_Art, is_BoE_phys, Bank_phys)],
                       by = "ID_Art", all = TRUE) %>% unique()

dt_boe_or_not <- merge(dt_boe_or_not,
                       dt_w_emails[, list(ID_Art, is_BoE_email, Bank_email)],
                       by = "ID_Art", all = TRUE) %>% unique()
# These two merges create multiple lines about the same ID_Art.
# We collapse them into one, which will give us whether there is at least
# one author from the BoE
# and whether there is one from another bank.
#
# Every flag uses any(..., na.rm = TRUE): the outer joins leave NA wherever an
# article has no address or no e-mail row, and without na.rm a group with no
# positive match but some NA would produce NA instead of FALSE.
dt_boe_or_not <- dt_boe_or_not[
  , list(Year = unique(Year), Citations = unique(Citations),
         Code_Document = unique(Code_Document),
         is_BoE = any(is_BoE_phys == TRUE | is_BoE_email == TRUE,
                      na.rm = TRUE),
         # Other including Fed
         is_other1 = any(is_BoE_phys == FALSE | is_BoE_email == FALSE,
                         na.rm = TRUE),
         is_Fed = any(Bank_phys == "Fed" | Bank_email == "Fed",
                      na.rm = TRUE),
         # Other excluding Fed
         is_other2 = any(Bank_phys == "Other" | Bank_email == "Other",
                         na.rm = TRUE)
         ), by = ID_Art]

# Dropping documents that are not articles, notes or reviews
dt_boe_or_not <- dt_boe_or_not[Code_Document < 4]


# Yearly number of articles for the three groups of central banks. An article
# co-authored across groups is counted once in each group it belongs to.
dt_plot <- rbind(
  dt_boe_or_not[is_BoE == TRUE,
                .(Bank = "Bank of England", nb = .N), by = Year],
  dt_boe_or_not[is_Fed == TRUE,
                .(Bank = "Federal Reserve", nb = .N), by = Year],
  dt_boe_or_not[is_other2 == TRUE,
                .(Bank = "All other central banks", nb = .N), by = Year]
)
# Legend order: bank names in decreasing alphabetical order.
dt_plot[, Bank := factor(Bank, levels = sort(unique(Bank), decreasing = TRUE))]

# Line plot restricted to the years listed in `timespan`; the line width comes
# from the variable `linewidth`. No title is added to the saved figure.
p <- ggplot(data = dt_plot[Year %in% timespan],
            mapping = aes(x = Year, y = nb, color = Bank)) +
  geom_line(lwd = linewidth) +
  ylab("Number of articles")

fig_name <- paste0("FigA", sprintf("%02d", fig_i), "_CB_articles_in_WoS.jpeg")

ggsave(filename = paste0(article_picture_folder, fig_name), plot = p)
Figure A9: Number of articles published annually by different central banks (Web of Science data).

One might wonder which central banks are the most productive among the “all others.” Below are the most frequent institutions in the address field and the most frequent domains in the email field.

# Five most frequent institutions among the address rows tagged "Other".
dt_w_address[Bank_phys == "Other", .(N = .N), by = Institution] %>%
  .[order(-N)] %>%
  head(n = 5) %>%
  kable(booktabs = TRUE) %>%
  kable_styling(position = "center")
Institution N
EUROPEAN-CENT-BANK 1148
BANK-CANADA 524
BANK-ITALY 521
DEUTSCH-BUNDESBANK 498
BANQUE-FRANCE 481
# Five most frequent e-mail domains among the e-mail rows tagged "Other".
dt_w_emails[Bank_email == "Other", .(N = .N), by = end_e] %>%
  .[order(-N)] %>%
  head(n = 5) %>%
  kable(booktabs = TRUE) %>%
  kable_styling(position = "center")
end_e N
ecb.int 641
bancaditalia.it 595
ecb.europa.eu 386
bundesbank.de 385
dnb.nl 319

The institutions in the Eurozone thus dominate this category, with the Bank of Canada also making the list.

3.8.1 Citations by central banks

Figure A10 displays the most-cited articles, for each sub-period, by articles from BoE authors and by articles from authors affiliated to all the other central banks (Fed included).

# Sub-periods of the analysis: element i of start_date and end_date gives the
# first and last publication year of sub-period i.
start_date <- c(1979, 1992, 2000, 2008, 2014)
end_date <- c(1991, 1999, 2007, 2013, 2019)

########## Creating the Boe Articles list, and their references #########--------
# Articles with at least one author flagged as BoE (address or e-mail),
# one row per article/author.
BoE_articles <- dt_articles[, .(ID_Art, Year, Titre, Revue, ItemID_Ref)] %>%
  merge(dt_w_address[, .(ID_Art, is_BoE_phys)], by = "ID_Art", all.x = TRUE) %>%
  merge(dt_w_emails[, .(ID_Art, is_BoE_email)], by = "ID_Art", all.x = TRUE) %>%
  merge(dt_authors[, .(ID_Art, Nom)], by = "ID_Art")
BoE_articles <- unique(BoE_articles[
  (is_BoE_phys == TRUE | is_BoE_email == TRUE) &
    Year >= "1979" & Year <= "2019"
])
BoE_articles <- BoE_articles[, .(Nom, ID_Art, Titre, Year, Revue, ItemID_Ref)]

# References cited by these BoE articles: one row per (citing article,
# reference), carrying the citing article's publication year.
BoE_refs <- merge(
  unique(BoE_articles[, .(ID_Art, Year)]),
  dt_references[, .(ID_Art, ItemID_Ref, Nom, Revue_Abbrege, Annee)],
  by = "ID_Art"
)

# The heavier source tables (dt_articles, dt_authors, dt_references) are kept
# in memory: they are needed again for the other central banks.

# For the moment, only references that have an ItemID_Ref are kept
# (missing identifiers are stored as the string "NULL").
BoE_refs_reduce <- BoE_refs[ItemID_Ref != "NULL"]
####### Comparisons between BoE publications and other central Banks ####

# creating an object with all the articles published by authors 
# affiliated to other central banks than BoE
# (same construction as for the BoE articles, with the flags tested against
#  FALSE; one row per article/author)
non_BoE_articles <- merge(dt_articles[
  ,list(ID_Art,Year,Titre,Revue,ItemID_Ref)],
  dt_w_address[,list(ID_Art,is_BoE_phys)], by = "ID_Art", all.x = TRUE)
non_BoE_articles <- merge(non_BoE_articles,
                          dt_w_emails[,list(ID_Art,is_BoE_email)],
                          by = "ID_Art", all.x = TRUE)
non_BoE_articles <- merge(non_BoE_articles,
                          dt_authors[,list(ID_Art,Nom)], by = "ID_Art")
# An article is kept as soon as one of its address or e-mail rows is flagged
# as not BoE, so an article co-authored with a BoE economist can appear both
# here and among the BoE articles.
non_BoE_articles <- non_BoE_articles[
  (is_BoE_phys == FALSE | is_BoE_email == FALSE) &
    (Year >= "1979" & Year <= "2019" ),]
non_BoE_articles <- non_BoE_articles[
  ,c("Nom","ID_Art","Titre","Year","Revue")]
non_BoE_articles <- unique(non_BoE_articles)

# getting the list of references cited by these non-BoE articles
# (one row per citing article and reference, with the citing article's year)
non_BoE_refs <- merge(unique(
  non_BoE_articles[,c("ID_Art","Year")]),
  dt_references[
    ,c("ID_Art", "ItemID_Ref", "Nom","Revue_Abbrege","Annee")],
  by="ID_Art") 
# only keeping references with an ItemID_Ref (missing ones are the string "NULL")
non_BoE_refs_reduce <- non_BoE_refs[ItemID_Ref != "NULL"]

#' Most-cited references of one corpus during one sub-period
#'
#' @param refs data.table of citation links with (at least) the columns
#'   Year (publication year of the citing article), ItemID_Ref, Nom,
#'   Revue_Abbrege and Annee. It is not modified.
#' @param first_year,last_year Bounds (inclusive) of the sub-period.
#' @param bank_label Value written in the `Bank` column of the result.
#' @param n_top Maximum number of references returned.
#' @return A data.table with one row per reference, ordered by decreasing
#'   number of citations, with the columns ItemID_Ref, Nom, Revue_Abbrege,
#'   Annee, n_cit, Rank, Bank and Period. It has zero rows when the corpus
#'   holds no reference for the sub-period.
top_cited_references <- function(refs, first_year, last_year, bank_label,
                                 n_top = 20) {
  # Subsetting creates a new table, so `refs` itself is left untouched by the
  # := calls below.
  period_refs <- refs[between(Year, first_year, last_year)]
  period_refs[, n_cit := .N, by = "ItemID_Ref"]

  ranked <- unique(period_refs[
    order(-n_cit), c("ItemID_Ref", "Nom", "Revue_Abbrege", "Annee", "n_cit")])
  # The same ItemID_Ref may still appear with several spellings of its
  # metadata; keep the first occurrence only. A logical filter is used on
  # purpose: `ranked[-which(duplicated(...))]` would drop EVERY row when there
  # is no duplicate, because a negative zero-length index selects nothing.
  ranked <- ranked[!duplicated(ItemID_Ref)]
  ranked <- head(ranked, n_top)

  ranked[, Rank := seq_len(.N)]
  ranked[, Bank := bank_label]
  ranked[, Period := paste0(first_year, "-", last_year)]
  ranked[]
}

# The 20 most cited references by BoE articles and by articles from the other
# central banks, for each sub-period (BoE rows first within each sub-period).
most_cited <- rbindlist(lapply(seq_along(start_date), function(i) {
  rbindlist(list(
    top_cited_references(BoE_refs_reduce, start_date[i], end_date[i],
                         "Bank of England"),
    top_cited_references(non_BoE_refs_reduce, start_date[i], end_date[i],
                         "Other Central Banks")
  ))
}))

# Axis label "<rank>-<first part of the name, up to the first hyphen>-<year>".
most_cited[, Nom := gsub("-.*", "", Nom)]
most_cited[, Label := paste0(Rank, "-", Nom, "-", Annee)]

# Bar charts of the most cited references: one panel per sub-period and
# corpus (BoE vs other central banks), bars ordered by rank inside each panel.
p <- most_cited %>%
  mutate(Label = reorder_within(Label, Rank, within = list(Bank, Period))) %>%
  ggplot(aes(x = Label, y = n_cit, fill = Period)) +
  geom_bar(stat = "identity", alpha = .8, show.legend = FALSE) +
  facet_wrap(
    Period ~ Bank,
    nrow = 5,
    scales = "free",
    labeller = label_wrap_gen(width = 35, multi_line = FALSE)
  ) +
  # Strips the suffix that reorder_within() appended to the labels.
  scale_x_reordered() +
  theme(
    strip.text.x = element_text(size = 9, face = "bold"),
    axis.text.x = element_text(size = 7, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 4)
  ) +
  # No x-axis title in the saved figure.
  labs(x = "", y = "Number of citations")

fig_name <- paste0("FigA", sprintf("%02d", fig_i),
                   "_Citation_comparison.jpeg")

# Colour version: one fill colour per sub-period.
ggsave(filename = paste0(article_picture_folder, fig_name),
       plot = p + scale_fill_viridis(discrete = TRUE),
       width = 7, height = 10, units = "in")

# Print version: every sub-period is drawn in black.
ggsave(filename = paste0(article_picture_folder,
                         string_for_BW_print_version, fig_name),
       plot = p + scale_fill_manual(
         values = rep("black", times = length(unique(most_cited$Period)))),
       width = 7, height = 10, units = "in")
Figure A10: Most-cited articles in research by the Bank of England vs other central bank economists.

Figure A11 takes the twenty most-cited references by BoE articles and by non-BoE articles (for each sub-period), and runs a chi2 test to check if the two samples are independent. The graph displays the residuals for each reference: a positive residual means that the reference is over-cited by BoE articles, in comparison to its citations by other CB articles. The number of references for each sub-period (for instance, 35 for the first sub-period) indicates the degree of conformity of BoE references with other CB. The maximum number of references is 40 (if the 20 most-cited references by the BoE are all different from the 20 most-cited references by other CB). The smaller the number of references, the larger the conformity of the BoE with what is most cited by other CB.

# running chi-2 test for the 20 most cited references of each corpus

# Accumulator for the per-period results. Every argument below is c() (NULL),
# so it starts without any column; the rows appended in the loop carry columns
# named Res_chi and Type (not chi2_residual and type).
Refs_chi2 <- data.frame("ItemID_Ref" = c(),"Nom"=c(), "Revue_Abbrege" = c(),
                         "Annee" = c(), "n_cit_non_BoE" = c(), "n_cit_BoE" = c(),
                        "chi2_residual" = c(), "Rank" = c(), "type" = c())


for(i in 1:length(start_date)){
  # --- Step 1: the 20 most cited references of each corpus in sub-period i ---
  non_BoE_most_cited <- non_BoE_refs_reduce[between(Year,start_date[i],end_date[i])]
  non_BoE_most_cited <- non_BoE_most_cited[,n_cit := .(.N), by = "ItemID_Ref"]
  non_BoE_most_cited <- unique(non_BoE_most_cited[
    order(-n_cit), c("ItemID_Ref",
                     "Nom","Revue_Abbrege","Annee")])
  # rows repeating an ItemID_Ref already seen (e.g. with different metadata)
  doublons <- which(duplicated(non_BoE_most_cited$ItemID_Ref))
  
  # the guard matters: indexing with an empty negative vector would drop
  # every row
  if(length(doublons) > 0){
    (non_BoE_most_cited <-  non_BoE_most_cited[-doublons])
  }

  non_BoE_most_cited <- head(non_BoE_most_cited, 20)


  BoE_most_cited <- BoE_refs_reduce[between(Year,start_date[i],end_date[i])]
  BoE_most_cited <- BoE_most_cited[,n_cit := .(.N), by = "ItemID_Ref"] 
  BoE_most_cited <- unique(BoE_most_cited[
    order(-n_cit), c("ItemID_Ref",
                     "Nom","Revue_Abbrege","Annee")])
  doublons <- which(duplicated(BoE_most_cited$ItemID_Ref))
  if(length(doublons) > 0){
    (BoE_most_cited <-  BoE_most_cited[-doublons])
  }

  BoE_most_cited <- head(BoE_most_cited, 20)


  # --- Step 2: union of the two top-20 lists (between 20 and 40 references) ---
  Most_cited <- rbind(BoE_most_cited,non_BoE_most_cited)
  Most_cited <- unique(Most_cited[,"ItemID_Ref"])
  
  # --- Step 3: citations of each reference of the union, in each corpus ---
  non_BoE_chi2 <- non_BoE_refs_reduce[
    between(Year,start_date[i],end_date[i]) & ItemID_Ref %in% Most_cited$ItemID_Ref]
  non_BoE_chi2 <- non_BoE_chi2[,n_cit_non_BoE := .(.N), by = "ItemID_Ref"]
  non_BoE_chi2 <- unique(non_BoE_chi2[
    order(-n_cit_non_BoE), c("ItemID_Ref",
                             "Nom","Revue_Abbrege","Annee","n_cit_non_BoE")])
  doublons <- which(duplicated(non_BoE_chi2$ItemID_Ref))
  if(length(doublons) > 0){
    (non_BoE_chi2 <-  non_BoE_chi2[-doublons])
  }

  
  BoE_chi2 <- BoE_refs_reduce[between(Year,start_date[i],end_date[i]) & 
                                ItemID_Ref %in% Most_cited$ItemID_Ref]
  BoE_chi2 <- BoE_chi2[,n_cit_BoE := .(.N), by = "ItemID_Ref"]
  BoE_chi2 <- unique(BoE_chi2[order(-n_cit_BoE), 
                              c("ItemID_Ref",
                                "Nom","Revue_Abbrege","Annee","n_cit_BoE")])
  doublons <- which(duplicated(BoE_chi2$ItemID_Ref))
  if(length(doublons) > 0){
    (BoE_chi2 <-  BoE_chi2[-doublons])
  }
  
  
  # --- Step 4: one row per reference with both counts ---
  # Full outer join: a reference absent from one corpus gets a count of 0 there
  # and takes its metadata (.x columns) from the other corpus (.y columns).
  chi2_test <- merge(non_BoE_chi2, BoE_chi2, by = "ItemID_Ref", all = TRUE)
  chi2_test[is.na(n_cit_BoE)]$n_cit_BoE <- 0
  chi2_test[is.na(n_cit_non_BoE)]$n_cit_non_BoE <- 0
  chi2_test[is.na(Nom.x)]$Nom.x <- chi2_test[is.na(Nom.x)]$Nom.y
  chi2_test[is.na(Revue_Abbrege.x)]$Revue_Abbrege.x <- chi2_test[
    is.na(Revue_Abbrege.x)]$Revue_Abbrege.y
  chi2_test[is.na(Annee.x)]$Annee.x <- chi2_test[is.na(Annee.x)]$Annee.y
  # Positional selection: columns 1-5 are ItemID_Ref, the three .x metadata
  # columns and n_cit_non_BoE; column 9 is n_cit_BoE (columns 6-8 are the .y
  # metadata). This relies on the column order produced by the merge above.
  chi2_test <- chi2_test[,c(1:5,9)]
  colnames(chi2_test)[2:4] <- c("Nom","Revue_Abbrege","Annee")
  
  # --- Step 5: chi-2 test on the references x corpus table of counts ---
  # The second column of the residuals is the BoE column: a positive value
  # means the reference is cited more by BoE articles than expected.
  res_chi <- chisq.test(chi2_test[,list(n_cit_non_BoE,n_cit_BoE)])
  chi2_test$Res_chi <- res_chi$residuals[,2]
  
  # Rank 1 = largest residual
  chi2_test <- chi2_test[order(-Res_chi)]
  chi2_test$Rank <- 1:length(chi2_test$ItemID_Ref)
  
  # Label of the sub-period, e.g. "1979-1991"
  chi2_test$Type <- paste0(start_date[i],"-",end_date[i])
  
  Refs_chi2 <- rbind(Refs_chi2, chi2_test)
}

# Axis label "<rank>-<first part of the name, up to the first hyphen>-<year>"
Refs_chi2 <- Refs_chi2[, Nom := gsub("-.*","",Nom)][
  , Label := paste0(Rank,"-",Nom,"-",Annee)]

# plotting the chi-2 residual of each reference, one panel per sub-period,
# bars ordered by rank (largest residual first) inside each panel
p <- Refs_chi2 %>%
  mutate(Label = reorder_within(Label, Rank, Type)) %>%
  ggplot(aes(x = Label, y = Res_chi, fill = Type)) +
  geom_bar(stat = "identity", alpha = .8, show.legend = FALSE) +
  facet_wrap(~Type, nrow = 5, scales = "free") +
  # Strips the suffix that reorder_within() appended to the labels.
  scale_x_reordered() +
  scale_y_continuous(expand = c(0, 0)) +
  theme(
    strip.text.x = element_text(size = 10, face = "bold"),
    axis.text.x = element_text(size = 7, angle = 45, hjust = 1)
  ) +
  # No x-axis title in the saved figure.
  xlab("") +
  ylab("Chi-2 residual")

fig_name <- paste0("FigA", sprintf("%02d", fig_i),
                   "_aChi2_comparison.jpeg")

# Colour version: one fill colour per sub-period.
ggsave(paste0(article_picture_folder, fig_name),
       p + scale_fill_viridis(discrete = TRUE),
       width = 7, height = 9.5, units = "in")

# Print version: every sub-period is drawn in black. The number of colours is
# taken from the data actually plotted (Refs_chi2$Type), not from the table
# built for another figure.
ggsave(paste0(article_picture_folder, string_for_BW_print_version, fig_name),
       p + scale_fill_manual(
         values = rep("black", times = length(unique(Refs_chi2$Type)))),
       width = 7, height = 9.5, units = "in")
Figure A11: Comparison between the 20 most cited references by the Bank’s vs. the other central banks’ publications (listed in Web of Science). What is displayed are residuals of a Chi2 test. A negative value means that the reference is under-cited by the Bank’s articles in comparison to citations by other central banks’ publications.

Here is the list of the most cited references by BoE and other CB articles, for each subperiod. The rank column indicates the ranking for the chi2 residuals value (the first article for each subperiod represents the most “over-cited” article by BoE economists).

#kable(unique(Refs_chi2[,c("Nom","Annee","Revue_Abbrege")]), 
# caption = "Most Cited Articles") %>%
#kable_styling(fixed_thead = T) %>%
#scroll_box(width = "800px", height = "360px")

# Distinct rows of the reference table, sorted by period (Type), then by Rank.
Refs_table <- unique(
  Refs_chi2[
    order(Type, Rank),
    c("Nom", "Rank", "Annee", "Revue_Abbrege",
      "n_cit_BoE", "n_cit_non_BoE", "Type")
  ]
)

if (outputFormat == "html") {
  # html output: scrollable datatable without paging
  datatable(
    Refs_table,
    rownames = FALSE,
    options = list(
      scrollY = "400px",
      scrollCollapse = TRUE,
      scrollX = "300px",
      paging = FALSE
    )
  )
} else {
  # any other output: long kable table with a fixed width for columns 1 and 4
  col_width <- "8em"
  # Put the space back into this surname before printing
  Refs_table[Nom == "FERNANDEZVILLAVERDE", Nom := "FERNANDEZ VILLAVERDE"]
  Refs_table %>%
    kable(longtable = TRUE, booktabs = TRUE#, caption = "Longtable"
          ) %>%
    kable_styling(latex_options = c("repeat_header")) %>%
    column_spec(1, width = col_width) %>%
    column_spec(4, width = col_width)
}

3.8.2 Citations to central banks

Still using our extraction from Web of Science, we now assess whether BoE publications in external journals tend to be less cited than publications from all other central banks. In the two figures below, the years correspond to the publication date of the cited article. We cover 1979 to 2015, stopping with articles published in 2015 because citations take time to accumulate.

In Figure A12, the two lines give the geometric mean of the number of citations to articles published each year. We see that the Bank of England under-performed the other central banks as late as 2006, although confidence bands show that the difference in means is significant only in the early 1990s. The second figure represents the same distributions differently. When a bar is above zero in a given year, it means that the arithmetic mean or the median is higher for the BoE than for all the other central banks combined. If it is below zero, it means that these statistics are lower for the BoE. Since it is a log scale, the height of a bar above or below zero can be compared directly to assess how far apart the statistics are between the BoE and the other central banks.

# Citations need time to accumulate, so the window of publication years
# stops a few years before the most recent data.
min_y <- 1979
top_y <- 2015

# Articles with a missing citation count are treated as having 0 citations
# (updated by reference in dt_boe_or_not).
dt_boe_or_not[is.na(Citations), Citations := 0]

# Long format: an article flagged both is_BoE and is_other1 gets one row
# per flag.
dt_plot <- data.table(
  pivot_longer(dt_boe_or_not, cols = c(is_BoE, is_other1), names_to = "cb")
)
# Keep only the flags that are TRUE, then drop the flag column itself
dt_plot <- dt_plot[value == TRUE][, value := NULL]
# Attach a readable bank label to each flag name
dt_plot <- merge(
  dt_plot,
  data.table(
    cb = c("is_BoE", "is_other1"),
    Bank = c("Bank of England", "All other central banks")
  ),
  by = "cb"
)

# First graph: smoothed curve of log citations per publication year and bank.
# NOTE(review): articles with 0 citations give log(0) = -Inf; check how
# geom_smooth() treats these values.
p <- dt_plot[Year >= min_y & Year <= top_y] %>%
  ggplot(aes(x = Year, y = log(Citations), color = Bank)) +
  geom_smooth() +
  labs(x = "", y = "Logarithm of citation number"#,
       #title =  "Central tendencies of citations to external articles
       #with author(s) from central banks"
       ) +
  scale_color_brewer(palette = "Set1", direction = -1)

fig_name <- paste0("FigA", sprintf("%02d", fig_i),
                   "_cit_performance.jpeg")

ggsave(paste0(article_picture_folder, fig_name), p)
Figure A12: Central tendencies of citations to external articles with author(s) from central banks.

Figure A13 represents the same distributions differently. When a bar is above zero in a given year, it means that the arithmetic mean or the median is higher for the BoE than for all the other central banks combined. If it is below zero, it means that these statistics are lower for the BoE. Since it is a log scale, the height of a bar above or below zero can be compared directly to assess how far apart the statistics are between the BoE and the other central banks.

# Second graph, looking at mean and median per year
# Median and arithmetic mean of citations for each publication year and bank,
# restricted to the [min_y, top_y] window.
comp_cit <- dt_plot[
  Year >= min_y & Year <= top_y,
  list(Median = median(Citations), Mean = mean(Citations)),
  by = list(Year, Bank, cb)
]

# A graph of the log ratio of the two subsample statistics
# For every year: log(BoE statistic / other central banks statistic), once
# for the mean and once for the median, stacked in long format.
setkey(comp_cit, Year, Bank)
comp_cit_year <- rbind(
  comp_cit[
    ,
    list(
      dif = log(Mean[cb == "is_BoE"] / Mean[cb != "is_BoE"]),
      type = "Logarithm of mean ratio"
    ),
    by = Year
  ],
  comp_cit[
    ,
    list(
      dif = log(Median[cb == "is_BoE"] / Median[cb != "is_BoE"]),
      type = "Logarithm of median ratio"
    ),
    by = Year
  ]
)

# Dodged bars: one bar per measure and year
p <- ggplot(comp_cit_year, aes(x = Year, y = dif, fill = type)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_bw() +
  #scale_fill_manual(values = c("grey70", "grey30"))  + 
  labs(
    x = "",
    y = "Logarithmic scale",
    fill = "Measure comparing citations\nfor the two subsamples"
  ) #+
    # theme(axis.text = element_text(size=18)
    #      ,axis.title = element_text(size=20)
    #       , legend.title = element_text(size=20),
    #       legend.text = element_text(size=14)) 

#  p + 
#    ggtitle("Citations to external articles with author(s) from central banks.",     
#      subtitle = "A bar above the origin signals an advantage to the Bank of England.")

fig_name <- paste0("FigA", sprintf("%02d", fig_i),
                   "_cit_performance2.jpeg")

ggsave(paste0(article_picture_folder, fig_name), p)
Figure A13: Citations to external articles with author(s) from central banks. A bar above the origin signals an advantage to the Bank of England.

3.9 Evolution of topics through time

We trained a topic model on the raw text of the BoE research documents using the implementation of Latent Dirichlet Allocation (LDA) in the Python package sklearn. LDA requires that the number of topics be selected manually. After inspecting results for different values of this parameter, we decided that 50 topics gave a model that was easy to interpret. Only 3 topics were meaningless and have been excluded.

Topics were named manually based on the background knowledge of our team: for each topic, we inspected the top 50 words and the most characteristic documents. After this first interpretation, we have grouped similar topics into 8 “types,” which can be taken as “meta-topics”. We excluded the 3 meaningless topics plus one topic on game theory, which was very isolated. The 46 remaining topics are allocated between the different types (see the table below for the complete list of topics and topic types).

The first table associates the 47 meaningful topics with their topic type and their first 15 top words.

## Identity of topics and meta-topics ##

# extracting n top words for each topic
# The first column of Words_topics is used as the topic number; the first
# n_words entries of the remaining columns are pasted into one
# comma-separated string per row.
n_words <- 15
dt_topwords_per_topics <- data.table(
  Topic_number = Words_topics[, 1],
  Keywords = apply(Words_topics[, -1],
                   MARGIN = 1,
                   function(x) paste(x[1:n_words], collapse = ", "))
)

# Adding the keywords to the table of topic names and topic types
Types_topics <- merge(Types_topics, dt_topwords_per_topics,
                      by = "Topic_number")

# topic numbers of residual topics
Residuals <- Types_topics[Topic_type == "residual"]$Topic_number

# having topic list
# Residual topics are dropped with %in%: `!=` against a vector of several
# topic numbers recycles that vector element-wise and does not test set
# membership, so it would leave residual topics in the list.
Topics_list <- Types_topics[!(Topic_number %in% Residuals), ]
Topics_list <- Topics_list[, c("Topic_type", "Topic_name", "Keywords")]
Topics_list <- Topics_list[order(Topic_type, Topic_name)]

setnames(Topics_list, "Keywords", paste("Top", n_words, "words"))

# Table of topics list
if (outputFormat == "html") { # print the datatable conditional on html output
  datatable(Topics_list, rownames = FALSE,
            options = list(scrollY = "400px",
                           scrollCollapse = TRUE,
                           scrollX = "300px",
                           paging = FALSE))
} else {
  col_width <- "15em"
  Topics_list %>%
    kable(longtable = TRUE, booktabs = TRUE#, caption = "Longtable"
          ) %>%
    kable_styling(latex_options = c("repeat_header", "striped")) %>%
    column_spec(1:3, width = col_width)# %>% column_spec(3, width = col_width)
}

The next table displays the 3 main topics for each period of two years, and their corresponding topic types.

# finding the n most representative topics for each period of two years
# Residual topics are removed with %in%: `!=` against a vector of several
# topic numbers recycles that vector element-wise instead of testing
# membership, so residual topics could survive the filter.
main_topics_period <- Period_topics[!(topic %in% Residuals), ]
# Long format: one row per (topic, period), with the weight in `value`
main_topics_period <- reshape2::melt(main_topics_period, id.vars = "topic")
# Within each period, largest values first, then keep the first three rows
main_topics_period <- main_topics_period[order(variable, -value), ]
main_topics_period <- main_topics_period %>%
  group_by(variable) %>%
  slice(1:3) %>%
  as.data.table()

# merging the main topics with the topics name

topics <- merge(Types_topics[, c("Topic_number", "Topic_name", "Topic_type")],
                main_topics_period, by.y = "topic", by.x = "Topic_number")

# Simplifying the name of one theme (needed for consistency with second plotting)
topics[Topic_type == "Banking and other financial institutions supervision"
       ]$Topic_type <- "Financial institutions supervision"

# Table with resulting data

topics_table <- unique(topics[, c("Topic_name", "Topic_type", "variable", "value")])
colnames(topics_table)[3] <- "Period"
topics_table <- topics_table[order(Period, -value)]

if (outputFormat == "html") { # print the datatable conditional on html output
  datatable(topics_table, rownames = FALSE,
            options = list(scrollY = "400px",
                           scrollCollapse = TRUE,
                           scrollX = "300px",
                           paging = FALSE))
} else {
  col_width <- "10em"
  topics_table %>%
    kable(longtable = TRUE, booktabs = TRUE#, caption = "Main Topics"
          ) %>%
    kable_styling(latex_options = c("repeat_header", "striped")) %>%
    column_spec(1:2, width = col_width)
}
# plotting the top n topic title by period

#topics %>%
#  filter(variable != "(1972, 1973)" & variable != "(1974, 1975)" & 
#  variable!="(1976, 1977)" & Topic_type != "Other") %>%
#ggplot(aes(x = variable, y = value, group = Topic_type)) +
#  geom_label_repel(aes(label = Topic_name, fill = Topic_type), size = 2.5, alpha = 0.7) +
#  theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1)) +
#  coord_flip() + 
#  ggsave(paste0(picture_path,"Main_topics.png"), width=400, height=215, units = "mm")

The last figure (included in the main article) presents the share of each meta-topic over time.

## Evolution of themes through time
# Long format: one row per (topic, two-year period)
Period_topics_by_row <- reshape2::melt(Period_topics, id.vars = "topic")

# Attach the topic type of each topic; topics typed "residual" are left out
# because the merge only keeps topics present in both tables.
themes <- merge(
  Period_topics_by_row,
  Types_topics[Topic_type != "residual", c("Topic_number", "Topic_type")],
  by.x = "topic",
  by.y = "Topic_number"
)

# Replace decimal commas by dots, then convert the weights to numeric
themes$value <- as.numeric(gsub(",", ".", themes$value))

# Total weight of each topic type within each period ...
themes <- themes[, sum_values := sum(value), by = c("Topic_type", "variable")]
themes <- unique(themes[, c("variable", "Topic_type", "sum_values")])
# ... expressed as a share of the period total
themes <- themes[, percentage := sum_values / sum(sum_values), by = "variable"]

# simplifying the name of one theme
themes[Topic_type == "Banking and other financial institutions supervision"
       ]$Topic_type <- "Financial institutions supervision"


# New plot
# Give each topic type a shorter display name (column Topic), with line
# breaks where needed, so that the labels fit better on the graph.
themes <- merge(
  themes,
  data.table(
    Topic_type = c(
      "Financial institutions supervision",
      "Financial Markets",
      "Other",
      "Real economy / Structural Analysis",
      "Conjunctural Analysis / Business Cycle",
      "International economics",
      "Econometrics and data analysis",
      "Monetary Policy and operationnalisation",
      "Inflation analysis"
    ),
    Topic = c(
      "Financial Supervision",
      "Financial Markets",
      "Other",
      "Real Economy &\nStructural Analysis",
      "Business Cycle &\nConjunctural Analysis",
      "International Economics",
      "Econometrics &\nData Analysis",
      "Monetary Policy &\nFramework",
      "Inflation Analysis"
    )
  ),
  by = "Topic_type"
)

# Ordering the topics to highlight some patterns
# (the order of the factor levels is the stacking order used by the plot)
themes$Topic <- factor(
  themes$Topic,
  levels = c(
    "Financial Supervision",
    "Financial Markets",
    "Other",
    "Real Economy &\nStructural Analysis",
    "International Economics",
    "Monetary Policy &\nFramework",
    "Inflation Analysis",
    "Business Cycle &\nConjunctural Analysis",
    "Econometrics &\nData Analysis"
  )
)

# Creating a fake year column which takes only the end of the two year interval:
# split the period label on the comma, take the second piece and strip
# whatever follows its digits before converting to numeric.
themes[, Year := unlist(
  lapply(
    strsplit(as.character(variable), ","),
    function(period_bounds) {
      as.numeric(gsub("([0-9]+).*$", "\\1", period_bounds[2]))
    }
  )
)]

# Tweak to get the labels at the end of plot 
# (like with geom_dl, which doesn't work with geom_area)
# Take the rows of the last year, sort them in decreasing Topic order and
# accumulate the shares; rollmean() over c(0, cum) with k = 2 then gives the
# midpoint of each band, which is used as the vertical label position.
end_themes <- themes[Year == last(Year)]
end_themes <- end_themes[order(Topic, levels(Topic), decreasing = TRUE)]
end_themes[, cum := cumsum(percentage)]
end_themes[, y_pos_lab := rollmean(c(0, cum), k = 2)]

# Periods left out of the plot. They are excluded with %in%: `!=` against a
# vector of three labels recycles that vector element-wise and does not test
# membership, so it would not remove all rows of these periods.
excluded_periods <- c("(1972, 1973)", "(1974, 1975)", "(1976, 1977)")

p <- themes[!(variable %in% excluded_periods)] %>%
  ggplot(aes(x = Year, y = percentage, group = Topic, fill = Topic)) +
  geom_area(alpha = 0.8) + theme_classic() +
  theme(axis.text = element_text(size = size_x_and_y),
        legend.position = "none",
        # axis.text.x = element_text(angle = 45, hjust = 1),
        # wide right margin to leave room for the labels drawn outside the panel
        plot.margin = margin(0.1, 4, 0.1, 0.1, "cm"),
        title = element_text(size = size_x_and_y * title_boost_factor),
        plot.subtitle = element_text(size = size_x_and_y),
        legend.text = element_text(size = size_x_and_y),
        legend.title = element_text(size = size_x_and_y * title_boost_factor)) +
  labs(y = "Cumulative proportion of research output", fill = "Topic", x = ""
   #, title = "Size of the General Topics in the Research Output\nof the Bank of England"
       ) +
  # clip = 'off' lets the labels extend beyond the plotting panel
  coord_cartesian(clip = 'off') +
  # alpha is a constant, so it is set outside aes(); inside aes() it would be
  # mapped through an alpha scale instead of being used as 0.7.
  geom_label(data = end_themes,
             aes(x = Year, y = y_pos_lab, label = Topic),
             alpha = .7, size = 4, hjust = 0, nudge_x = 0.1)

fig_name <- "Fig02_Themes_over_time.jpeg"

# Interleave the two halves of a 10-step palette (1, 6, 2, 7, ...) so that
# consecutive topics do not receive adjacent shades.
shuffling_index <- as.vector(rbind(1:5, 1:5 + 5))

viridis_shuffled <- viridis(n = 10)[shuffling_index]

# Colour version
ggsave(paste0(article_picture_folder, fig_name),
       p + scale_fill_manual(values = viridis_shuffled))

greys_shuffled <- grey(2:10 / 10)[shuffling_index]

# Black-and-white print version
ggsave(paste0(article_picture_folder, string_for_BW_print_version, fig_name),
       p + scale_fill_manual(values = greys_shuffled))
Figure 2: Share of each meta-topic (topic type) in the research output of the Bank of England over time, shown as cumulative proportions for each two-year period.