plotting_upset_diagrams.Rmd
UpSet diagrams visualizes complex intersections where the columns represent different combinations of factors across different sets of groups (rows). They are very helpful when looking at the different combinations of AMR determinants that may be responsible for resistance.
First get the relevant data sets
library(ghruR)
library(kableExtra)
epi_data <- ghruR::get_data_for_country(
country_value = "India",
type_value = "Epidemiological Metadata",
user_email = "anthony.underwood@cgps.group")
## [1] "anthony.underwood@cgps.group"
## [1] "anthony.underwood@cgps.group"
## [1] "anthony.underwood@cgps.group"
epi_data <- ghruR::clean_data(epi_data)
ast_data <- ghruR::get_data_for_country(
country_value = "India",
type_value = "Antimicrobial Susceptibility Testing",
user_email = "anthony.underwood@cgps.group")
## [1] "anthony.underwood@cgps.group"
acquired_amr_data <- ghruR::get_data_for_country(
country_value = "India",
type_value = "AMR Klebsiella pneumoniae",
AMR_type = "acquired",
user_email = "anthony.underwood@cgps.group")
## [1] "anthony.underwood@cgps.group"
point_amr_data <- ghruR::get_data_for_country(
country_value = "India",
type_value = "AMR Klebsiella pneumoniae",
AMR_type = "point",
user_email = "anthony.underwood@cgps.group")
## [1] "anthony.underwood@cgps.group"
mlst_data <- ghruR::get_data_for_country(
country_value = "India",
type_value = "MLST",
user_email = "anthony.underwood@cgps.group")
## [1] "anthony.underwood@cgps.group"
Convert the acquired AMR data to long format, annotate it and get the different data sets
# convert to long format
annotated_amr_data <- ghruR::annotate_amr_data(acquired_amr_data)
# filter data for those that have acquired the gene and have good pct_id match and ctg_cov
acquired_genes_present <- ghruR::filter_long_data(annotated_amr_data)
# filter by subclass
carbapenamases <- acquired_genes_present %>%
dplyr::filter(subclass == "CARBAPENEM") %>%
dplyr::select(`Sample id`, gene) %>%
dplyr::rename(determinant = gene)
head(carbapenamases) %>% kable() %>% kable_styling() %>% scroll_box(width = "100%")
Sample id | determinant |
---|---|
G18250048 | blaNDM-1 |
G18250048 | blaOXA-181 |
G18250048 | blaOXA-181 |
G18250088 | blaNDM-1 |
G18250088 | blaOXA-232 |
G18250088 | blaOXA-232 |
esbls <- acquired_genes_present %>%
dplyr::filter(subclass == "CEPHALOSPORIN") %>%
dplyr::select(`Sample id`, gene) %>%
dplyr::rename(determinant = gene) %>%
dplyr::filter(
str_starts(determinant, "blaCTX-M") |
str_starts(determinant, "blaSHV") |
str_starts(determinant, "blaTEM") |
str_starts(determinant, "blaOXA")
)
head(esbls) %>% kable() %>% kable_styling() %>% scroll_box(width = "100%")
Sample id | determinant |
---|---|
G18250048 | blaCTX-M-15 |
G18250088 | blaCTX-M-15 |
G18250088 | blaOXA-1 |
G18250103 | blaCTX-M-15 |
G18250103 | blaOXA-1 |
G18250114 | blaCTX-M-15 |
Now handling point data, first get the mutations in assembled genes
annotated_point_amr_data <- ghruR::get_annotated_point_amr_data(point_amr_data, 'klebsiella', annotation_type = 'mutations')
# get mutations specific for carbapenems
carbapenam_specific_mutations <- annotated_point_amr_data %>%
dplyr::filter(resistance == "carbapenem") %>%
select(-resistance)
head(carbapenam_specific_mutations) %>% kable() %>% kable_styling() %>% scroll_box(width = "100%")
Sample id | determinant |
---|---|
G18250048 | ompK36_A217S |
G18250048 | ompK37_E244D |
G18250048 | ompK37_I128M |
G18250048 | ompK37_I70M |
G18250048 | ompK37_N230G |
G18250088 | ompK36_A217S |
Now get the genes which were incomplete (partial or fragmented) and therefore possibly non-functional
annotated_point_amr_data <- ghruR::get_annotated_point_amr_data(point_amr_data, 'klebsiella', annotation_type = 'incomplete')
# get mutations specific for carbapenems
carbapenam_specific_incomplete_genes <- annotated_point_amr_data %>%
dplyr::filter(resistance == "carbapenem") %>%
select(-resistance)
head(carbapenam_specific_incomplete_genes) %>% kable() %>% kable_styling() %>% scroll_box(width = "100%")
Sample id | determinant |
---|---|
G18251889 | ompK36-absent |
G18252053 | ompK36-absent |
G18252054 | ompK36-partial |
G18253166 | ompK36-partial |
G18253170 | ompK36-absent |
G18253171 | ompK36-partial |
For simplicity in the plots convert both of these to be either gene-mutation
or gene-incomplete
carbapenam_specific_mutations %<>% dplyr::mutate(determinant = stringr::str_replace(determinant, "_.+$", "-mutation")) %>%
dplyr::distinct(`Sample id`, determinant)
head(carbapenam_specific_mutations) %>% kable() %>% kable_styling() %>% scroll_box(width = "100%")
Sample id | determinant |
---|---|
G18250048 | ompK36-mutation |
G18250048 | ompK37-mutation |
G18250088 | ompK36-mutation |
G18250088 | ompK37-mutation |
G18250103 | ompK36-mutation |
G18250103 | ompK37-mutation |
carbapenam_specific_incomplete_genes %<>% dplyr::mutate(determinant = stringr::str_replace(determinant, "-.+$", "-incomplete")) %>%
dplyr::distinct(`Sample id`, determinant)
head(carbapenam_specific_incomplete_genes) %>% kable() %>% kable_styling() %>% scroll_box(width = "100%")
Sample id | determinant |
---|---|
G18251889 | ompK36-incomplete |
G18252053 | ompK36-incomplete |
G18252054 | ompK36-incomplete |
G18253166 | ompK36-incomplete |
G18253170 | ompK36-incomplete |
G18253171 | ompK36-incomplete |
Produce a final dataframe from the acquired and point dataframes with determinants putatively involved in carbapenem resistance
This involves binding the rows and combining multiple determinants for the same sample into one by joining them into a list (for the ggupset function) using the group_by
and summarise
functions from the the dplyr
package
#combine all determinants
all_potential_CR_determinants <-
rbind(carbapenamases, esbls, carbapenam_specific_mutations, carbapenam_specific_incomplete_genes)
# merge multiple instances
all_potential_CR_determinants %<>% dplyr::group_by(`Sample id`) %>%
dplyr::summarise(determinant = list(determinant))
head(all_potential_CR_determinants) %>% kable() %>% kable_styling() %>% scroll_box(width = "100%")
Sample id | determinant |
---|---|
G18250048 | blaNDM-1 , blaOXA-181 , blaOXA-181 , blaCTX-M-15 , ompK36-mutation, ompK37-mutation |
G18250088 | blaNDM-1 , blaOXA-232 , blaOXA-232 , blaCTX-M-15 , blaOXA-1 , ompK36-mutation, ompK37-mutation |
G18250103 | blaNDM-1 , blaOXA-232 , blaOXA-232 , blaCTX-M-15 , blaOXA-1 , ompK36-mutation, ompK37-mutation |
G18250104 | ompK36-mutation, ompK37-mutation |
G18250114 | blaNDM-1 , blaCTX-M-15 , blaOXA-1 , ompK36-mutation, ompK37-mutation |
G18250122 | blaNDM-1 , blaCTX-M-15 , blaOXA-1 , ompK36-mutation, ompK37-mutation |
all_potential_CR_determinants %<>%
dplyr::filter(str_detect(determinant, "bla"))
# filter out those with only ESBL but no OmpK mutations
all_potential_CR_determinants %<>%
dplyr::filter(!
(
(
str_detect(determinant, "blaCTX-M") |
str_detect(determinant, "blaSHV") |
str_detect(determinant, "blaTEM") |
str_detect(determinant, "blaOXA")
) &
!str_detect(determinant, "ompK")
)
)
head(all_potential_CR_determinants) %>% kable() %>% kable_styling() %>% scroll_box(width = "100%")
Sample id | determinant |
---|---|
G18250048 | blaNDM-1 , blaOXA-181 , blaOXA-181 , blaCTX-M-15 , ompK36-mutation, ompK37-mutation |
G18250088 | blaNDM-1 , blaOXA-232 , blaOXA-232 , blaCTX-M-15 , blaOXA-1 , ompK36-mutation, ompK37-mutation |
G18250103 | blaNDM-1 , blaOXA-232 , blaOXA-232 , blaCTX-M-15 , blaOXA-1 , ompK36-mutation, ompK37-mutation |
G18250114 | blaNDM-1 , blaCTX-M-15 , blaOXA-1 , ompK36-mutation, ompK37-mutation |
G18250122 | blaNDM-1 , blaCTX-M-15 , blaOXA-1 , ompK36-mutation, ompK37-mutation |
G18251889 | blaOXA-232 , blaOXA-232 , blaCTX-M-15 , ompK37-mutation , ompK36-incomplete |
Use the ggupset library to plot the data
library(ggupset)
# Find unique determinants to become the set labels
unique_determinants <- rbind(carbapenamases, esbls, carbapenam_specific_mutations, carbapenam_specific_incomplete_genes) %>%
dplyr::distinct(determinant) %>%
dplyr::arrange(determinant) %>%
dplyr::pull(determinant)
# make the upset plot
upset_plot <- all_potential_CR_determinants %>%
ggplot(aes(x=determinant)) +
geom_bar() +
scale_x_upset(order_by = "freq", sets = unique_determinants)
upset_plot
library(cowplot)
library(forcats)
all_potential_CR_determinants_and_ST <- all_potential_CR_determinants %>%
dplyr::left_join(mlst_data %>% dplyr::select(`Sample id`, ST), by = "Sample id")
num <- 9
most_frequent_STs <- all_potential_CR_determinants_and_ST %>%
dplyr::select(ST) %>% dplyr::count(ST) %>%
dplyr::arrange(desc(n)) %>% dplyr::slice_max(n, n = num) %>%
dplyr::pull(ST)
plots = list()
plot_index <- 0
for (start in seq(1,9, by =3)){
plot_index <- plot_index + 1
end <- start + 2
selected_STs <- most_frequent_STs[start:end]
data_for_STs <- all_potential_CR_determinants_and_ST %>% dplyr::filter(ST %in% selected_STs)
determinants_for_STs <- data_for_STs %>%
unnest(cols = c(determinant)) %>%
dplyr::distinct(determinant) %>%
dplyr::arrange(determinant) %>%
dplyr::pull(determinant)
upset_plot_for_STs <- data_for_STs %>%
mutate(ST = fct_relevel(ST, selected_STs)) %>%
ggplot(aes(x = determinant)) +
geom_bar() +
theme(plot.title = element_text(size = 20, face = "bold")) +
facet_grid(cols = vars(ST), scales = "free_x", space = "free", drop = FALSE) +
ggeasy::easy_labs(x = "") +
scale_x_upset(order_by = "freq", sets = determinants_for_STs) +
theme(panel.spacing.x=unit(6, "lines"))
plots[[plot_index]] <- upset_plot_for_STs
}
plot_grid <- cowplot::plot_grid(plotlist = plots, align = "h", ncol = 1) +
cowplot::draw_figure_label(label = "determinant", position = "bottom", font = "bold")
plot_grid