Analysis of MPN Data
MPN_SIGURD.Rmd
Loading necessary packages.
suppressPackageStartupMessages(library(sigurd))
suppressPackageStartupMessages(library(SummarizedExperiment))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(ComplexHeatmap))
suppressPackageStartupMessages(library(circlize))
suppressPackageStartupMessages(library(grid))
suppressPackageStartupMessages(library(EnhancedVolcano))
Analysis of MPN data
Here, we illustrate how to analysis genotyping data from MPN samples. The JAK2V617F mutation was genotyped for 6 MPN samples and 3 healthy controls.
This setup makes this analysis different from the previous analyses. Previously, only one sample was analysed at a time. Now, we load and analyse the samples simultaneously.
Loading MPN data
Now, we load the JAK2V617F data and plot the mutated cells on the UMAP.
Since we are loading multiple samples, this code is more complicated than the code for the MAESTER SW sample.
# Loading the scRNA-seq data.
scrna <- load_object("/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/scrna_MPN.rds.lz4")
# Loading the genotyping data.
genotyping <- LoadingVarTrix_typewise(patient = "MPN", samples_file = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/DesignMatrix_VarTrix.csv", min_reads = 0, vcf_path = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/CosmicSubset_JAK2V617F.vcf", type_use = "scRNAseq_Somatic", min_cells = 0, verbose = FALSE)
amplicon <- LoadingVarTrix_typewise(patient = "MPN", samples_file = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/DesignMatrix_Amplicon.csv", min_reads = 0, vcf_path = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/CosmicSubset_UCSC_JAK2V617F.vcf", type_use = "scRNAseq_Amplicon", min_cells = 0, verbose = FALSE)
genotyping <- AmpliconSupplementing(scRNAseq = genotyping, amplicon = amplicon, verbose = FALSE)
# Filtering the genotyping object to remove false positives.
genotyping <- Filtering(genotyping, min_cells_per_variant = 0, fraction_threshold = 0.21, cells_include = colnames(scrna), min_variants_per_cell = 1, reject_value = "NoCall", verbose = FALSE)
Combining Seurat and SIGURD
Now, we add the genotyping information to the Seurat object.
scrna <- SetVariantInfo(SE = genotyping, seurat_object = scrna)
## [1] "You did not supply a vector of variants. All variants will be used."
Visualisation
We visualise the genotyping information on the UMAP.
# Plotting the mutated cells on the UMAP.
scrna$JAK2_p.V617F_c.1849G.T_consensus <- factor(scrna$JAK2_p.V617F_c.1849G.T_consensus, levels = c("Ref", "Alt"))
jak2_cells <- sum(scrna$JAK2_p.V617F_c.1849G.T_consensus == "Alt", na.rm = TRUE)
ref_cells <- sum(scrna$JAK2_p.V617F_c.1849G.T_consensus == "Ref", na.rm = TRUE)
all_cells <- ncol(scrna)
p <- DimPlot(scrna, group.by = "JAK2_p.V617F_c.1849G.T_consensus", reduction = "INTE_UMAP", order = TRUE, pt.size = 1, raster = FALSE, na.value = "grey95", cols = c(Alt = "#aa0051", Ref = "#79b938")) +
ggtitle(paste0("Alt: ", jak2_cells, ", Ref: ", ref_cells, ", All: ", all_cells)) +
xlab("UMAP 1") + ylab("UMAP 2") + scale_color_manual(breaks = c("Alt", "Ref"), values = c("#aa0051", "#79b938"), na.value = "grey95") +
theme(legend.position = "top")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
print(p)
We determine DEGs between the JAK2V617F and WT cells.
Now, we determine the DEGs between the JAK2V617F cells and the wild type cells for the Orthochromatic Erythroblasts.
scrna_orthoe <- subset(scrna, CellType == "Orthochromatic Erythroblasts" & Condition != "HC")
degs <- FindMarkers(scrna_orthoe, group.by = "JAK2_p.V617F_c.1849G.T_consensus", ident.1 = "Alt", ident.2 = "Ref", logfc.threshold = 0)
degs <- data.frame(degs, gene = rownames(degs))
degs_up <- nrow(subset(degs, avg_log2FC > 0.25 & p_val_adj < 0.05))
degs_down <- nrow(subset(degs, avg_log2FC < -0.25 & p_val_adj < 0.05))
EnhancedVolcano(degs, x = "avg_log2FC", y = "p_val_adj", lab = degs$gene, FCcutoff = 0.25, pCutoff = 0.05, title = "OrthoE JAK2V617F Pos. vs. Neg.",subtitle = NULL)
Combining the JAK2V617F data with the MT data.
We combine the JAK2V617F SIGURD object with the MT object. Cells and variants in both objects are combined.
To save time, we are only processing a single sample. All other samples are processed equally. Since the genotyping object is a list, it is easily processed using parallel computing.
First, we load and filter the mitochondrial variant information.
# Loading the data.
genotyping_mt <- LoadingMAEGATK_typewise(patient = "MPN1", samples_file = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/DesignMatrix_MT.csv", min_cells = 0, cells_include = colnames(scrna), type_use = "scRNAseq_MT", verbose = FALSE)
genotyping_mt <- Filtering(genotyping_mt, min_variants_per_cell = 1, min_cells_per_variant = 2, fraction_threshold = 0.05, cells_include = colnames(scrna), verbose = FALSE)
MPN VOIs and Diversity
Now, we select the mtVOIs. We then determine the clonal lineages and plot the results on a heatmap.
We subset the cells to only include the erythroid cells.
# Subset to erythroid cells.
erythroid_cells <- subset(scrna, annotation == "ErythroidCells")
genotyping_mt <- Filtering(genotyping_mt, cells_include = colnames(erythroid_cells))
## [1] "We remove all cells not in the allow list."
## [1] "We remove all the variants that are always NoCall."
# Selecting the mtVOIS.
mt_vois <- VariantSelection_Quantile(genotyping_mt, min_coverage = 1, verbose = FALSE)
# Determining the clonal lineages.
genotyping_mt <- ClonalDefinition(se = genotyping_mt, variants_ls = list(mt_vois), verbose = FALSE)
Visualisation of mtVOIs and Clonal Lineages
We now plot the variants of interest and the clonal lineages.
# Generating a heatmap.
colors_lineage <- c("C1" = "#563e00", "C2" = "#dec234", "C3" = "#e23856", "C4" = "#7f0073", "C5" = "#006639", "C6" = "#b1db8e", "C7" = "#00348f", "Negative" = "black")
ha <- columnAnnotation(Clones = colData(genotyping_mt)[,"Clones"],
col = list(Clones = colors_lineage),
annotation_name_gp = gpar(fontsize = 10, fontface = "plain"),
annotation_legend_param = list(Clones = list(nrow = 1, title = "")))
fraction <- as.matrix(assays(genotyping_mt)[["fraction"]])[mt_vois, ]
hm <- Heatmap(fraction,
name = "VAF",
row_title_gp = gpar(fontsize = 12),
row_names_gp = gpar(fontsize = 10),
row_names_max_width = unit(16, "cm"),
column_title = "MPN1",
column_title_gp = gpar(fontsize = 12),
column_names_gp = gpar(fontsize = 10),
col = colorRamp2(seq(0, round(max(fraction, na.rm = TRUE)), length.out = 9), c("#FCFCFC", "#FFEDB0", "#FFDF5F", "#FEC510", "#FA8E24", "#F14C2B", "#DA2828", "#BE2222", "#A31D1D")),
show_column_dend = FALSE,
show_row_names = TRUE,
show_column_names = FALSE,
cluster_columns = TRUE,
cluster_rows = FALSE,
heatmap_legend_param = list(border = "#000000", grid_height = unit(10, "mm"),
direction = "horizontal"),
bottom_annotation = ha,
border = TRUE,
use_raster = TRUE,
show_heatmap_legend = TRUE)
draw(hm, annotation_legend_side = "bottom")
Calculating clonal diversity.
We calculate the clonal diversity for the sample MPN1 as an example.
clonal_diversity <- ClonalDiversity(genotyping_mt, grouping = "Clones", diversity_measure = "EffectiveSpecies", verbose = FALSE)
print(paste0("Clonal Diversity in Effective Number of Species for MPN1: ", clonal_diversity))
## [1] "Clonal Diversity in Effective Number of Species for MPN1: 6.72545022391521"
Repeating the MT analysis for all samples.
We now repeat the MT analysis for all samples. This will take considereable time.
We speed the analsyis up by using parallel computing. We load the results generated by this code.
# We get all patients present in the design matrix.
design_matrix <- read.table("/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/DesignMatrix_MT.csv", header = TRUE, sep = ",")
patients <- design_matrix$patient[1]
genotyping_mt_all <- parallel::mclapply(patients, LoadingMAEGATK_typewise, samples_file = "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/DesignMatrix_MT.csv", min_cells = 0, cells_include = colnames(scrna), type_use = "scRNAseq_MT", verbose = FALSE, mc.cores = length(patients))
names(genotyping_mt_all) <- patients
genotyping_mt_all <- parallel::mclapply(genotyping_mt_all, Filtering, min_variants_per_cell = 1, min_cells_per_variant = 2, fraction_threshold = 0.05, cells_include = colnames(scrna), verbose = FALSE, mc.cores = length(patients))
Selecting variants of interest for all samples
We select variants of interest for all samples.
# Selecting the mtVOIS.
mt_vois_all <- parallel::mclapply(genotyping_mt_all, VariantSelection_Quantile, min_coverage = 1, verbose = FALSE, mc.cores = length(patients))
# Determining the clonal lineages. This takes very long. We use parallel computing to process all samples simultaneously.
genotyping_mt_all <- parallel::mclapply(1:length(mt_vois_all), function(x){
sample_use <- names(mt_vois_all)[x]
clones <- sigurd::ClonalDefinition(se = genotyping_mt_all[[sample_use]], mt_vois_all[sample_use], verbose = FALSE)
return(clones)
}, mc.cores = length(mt_vois_all))
names(genotyping_mt_all) <- patients
save_object(genotyping_mt_all, "/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/genotyping_mt_all.rds.lz4", "lz4")
Calculating the diverstiy for all samples
# We load the previous results.
genotyping_mt_all <- load_object("/data/MPN/exp/scRNA/MPN_mutations/SIGURD_paper/sigurd/data/genotyping_mt_all.rds.lz4")
clonal_diversity_all <- unlist(lapply(genotyping_mt_all, ClonalDiversity, grouping = "Clones", diversity_measure = "EffectiveSpecies", verbose = FALSE))
clonal_diversity_all <- data.frame(Sample = names(genotyping_mt_all), EffectiveSpecies = clonal_diversity_all)
knitr::kable(clonal_diversity_all, format="html")
Sample | EffectiveSpecies | |
---|---|---|
MPN1 | MPN1 | 6.556187 |
MPN2 | MPN2 | 35.941119 |
MPN3 | MPN3 | 24.329734 |
HC1 | HC1 | 62.496136 |
HC2 | HC2 | 494.903187 |
HC3 | HC3 | 237.440220 |
MPN6 | MPN6 | 11.016631 |
MPN4 | MPN4 | 14.843004 |
MPN5 | MPN5 | 42.984619 |