Figure 6 - Rare Variant

Author

Michael Gandal

Published

January 19, 2023

Load Packages and Data

source('code/fisher_overlap.R')

suppressPackageStartupMessages({
  library(rtracklayer)
  library(GenomicRanges)
  library(edgeR)
  library(WGCNA)
  library(biomaRt)
  library(parallel)
  library(tidyverse)
  library(tidytext)
  library(ggh4x)
})

colorVector = c(
  "Known" = "#009E73",
  "ISM"   = "#0072B2",
  "ISM_Prefix" = "#005996",
  "ISM_Suffix" = "#378bcc",
  "NIC"   = "#D55E00",
  "NNC"   = "#E69F00",
  "Other" = "#000000"
)
colorVector_ismSplit = colorVector[-2]



# Load bulk transcriptome data, see analysis/Figure1_BulkTxomeAnalysis.qmd
load("data/working/bulkTxome.Rdata")


# Load DGE/DTE/DTU sumstats
tableS3 = read_tsv("output/tables/TableS3_v3.tsv.gz")
Rows: 102319 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (5): isoform_id, gene_id, gene_name, condition_1, condition_2
dbl (9): DTU_dIF, DTU_pval, DTU_qval, DTE_log2FC, DTE_pval, DTE_qval, DGE_lo...
lgl (3): DTU, DTE, DGE

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tableS3$ensg = substr(tableS3$gene_id,1,15)

# Load networks
tidyMods = read_tsv(file='output/tables/TableS4A_networks.tsv')
Rows: 209812 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): network, transcript_id, module.color, module, gene_id, gene_name, ensg
dbl (3): module.number, kME, hub.rank
lgl (1): hub

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Load Rare Variant datasets

geneAnno = rtracklayer::import("ref/gencode.v33lift37.annotation.gtf.gz") %>% as_tibble() %>% filter(type=='gene')
geneAnno$ensg = substr(geneAnno$gene_id,1,15)
geneAnno = geneAnno[match(unique(geneAnno$ensg), geneAnno$ensg),]

#Binary vector 0 for all protein coding genes, NA for non-coding; will replace with 1 for rare variant disease risk gene
protein_coding_bg = rep(NA, times=nrow(geneAnno))
protein_coding_bg[geneAnno$gene_type=="protein_coding"] = 0
names(protein_coding_bg) = geneAnno$ensg


#Start with compiled list of risk genes from SFARI, Kaplanis,
risk_genes =read.csv("ref/ASD+SCZ+DDD_2022.csv")
rareVar.binary =data.frame(ASD.SFARI.1 = protein_coding_bg)
rownames(rareVar.binary)= geneAnno$ensg
rareVar.binary$ASD.SFARI.1[geneAnno$gene_name %in% risk_genes$Gene[risk_genes$Set=="ASD (SFARI score 1)"]] = 1
rareVar.binary$ASD.SFARI.1or2 = protein_coding_bg
rareVar.binary$ASD.SFARI.1or2[geneAnno$gene_name %in% risk_genes$Gene[risk_genes$Set=="ASD (SFARI score 1or2)"]] = 1
rareVar.binary$ASD.SFARI.S = protein_coding_bg
rareVar.binary$ASD.SFARI.S[geneAnno$gene_name %in% risk_genes$Gene[risk_genes$Set=="ASD (SFARI syndromic)"]] = 1
rareVar.binary$DDD.kaplanis = protein_coding_bg
rareVar.binary$DDD.kaplanis[geneAnno$gene_name %in% risk_genes$Gene[risk_genes$Set=="DDD (Kaplanis et al. 2019)"]] = 1

epilepsy_helbig = openxlsx::read.xlsx('http://epilepsygenetics.net/wp-content/uploads/2023/01/Channelopathist_genes_internal_2023_v2.xlsx')
rareVar.binary$EPI.helbig = protein_coding_bg
rareVar.binary$EPI.helbig[geneAnno$gene_name %in% epilepsy_helbig$Gene] = 1


# Fu et al., Nat Genetics 2022 -- ASD, NDD TADA
fu=openxlsx::read.xlsx(('https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-022-01104-0/MediaObjects/41588_2022_1104_MOESM3_ESM.xlsx'),'Supplementary Table 11')
fu$p_TADA_ASD[fu$p_TADA_ASD==0] = min(fu$p_TADA_ASD[fu$p_TADA_ASD >0],na.rm=T)
fu$p_TADA_NDD[fu$p_TADA_NDD==0] = min(fu$p_TADA_NDD[fu$p_TADA_NDD >0],na.rm=T)

rareVar.logit = rareVar.binary
rareVar.logit$ASD.fuTADA= -log10(fu$p_TADA_ASD)[match(geneAnno$ensg, fu$gene_id)]
rareVar.binary$ASD.fuTADA= as.numeric(fu$FDR_TADA_ASD[match(geneAnno$ensg, fu$gene_id)] < .1)
rareVar.logit$NDD.fuTADA = -log10(fu$p_TADA_NDD)[match(geneAnno$ensg, fu$gene_id)]
rareVar.binary$NDD.fuTADA = as.numeric(fu$FDR_TADA_NDD[match(geneAnno$ensg, fu$gene_id)] < .1)

SCZ.schema = read_tsv('ref/risk_genes/SCHEMA_gene_results.tsv')
Warning: One or more parsing issues, see `problems()` for details
Rows: 18321 Columns: 26
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr  (8): gene_id, group, OR (PTV), OR (Class I), OR (Class II), OR (PTV) up...
dbl (16): Case PTV, Ctrl PTV, Case mis3, Ctrl mis3, Case mis2, Ctrl mis2, P ...
lgl  (2): De novo mis3, De novo mis2

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
rareVar.logit$SCZ.schema = -log10(SCZ.schema$`P meta`[match(geneAnno$ensg, SCZ.schema$gene_id)])
rareVar.binary$SCZ.schema = as.numeric(SCZ.schema$`Q meta`[match(geneAnno$ensg, SCZ.schema$gene_id)] < .1)

BIP.bipex = read_tsv('ref/risk_genes/BipEx_gene_results.tsv') %>% filter(group=="Bipolar Disorder")
Rows: 119958 Columns: 20
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr  (4): gene_id, group, damaging_missense_fisher_gnom_non_psych_OR, ptv_fi...
dbl (16): n_cases, n_controls, damaging_missense_case_count, damaging_missen...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
rareVar.logit$BIP.bipex = -log10(BIP.bipex$ptv_fisher_gnom_non_psych_pval[match(geneAnno$ensg,BIP.bipex$gene_id)])
rareVar.binary$BIP.bipex = as.numeric(BIP.bipex$ptv_fisher_gnom_non_psych_pval[match(geneAnno$ensg,BIP.bipex$gene_id)] < 0.01 )

EPI.epi25 = read_tsv('ref/risk_genes/Epi25_gene_results.tsv') %>% filter(group=="EPI")
Rows: 71456 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): gene_id, group
dbl (9): xcase_lof, xctrl_lof, pval_lof, xcase_mpc, xctrl_mpc, pval_mpc, xca...
lgl (1): pval_infrIndel

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
rareVar.logit$EPI.epi25 = -log10(EPI.epi25$pval[match(geneAnno$ensg,EPI.epi25$gene_id)])
rareVar.binary$EPI.epi25 = as.numeric(EPI.epi25$pval[match(geneAnno$ensg,EPI.epi25$gene_id)] < .01)

apply(rareVar.binary,2, table)
  ASD.SFARI.1 ASD.SFARI.1or2 ASD.SFARI.S DDD.kaplanis EPI.helbig ASD.fuTADA
0       19853          19180       19913        19774      19941      17750
1         213            889         153          292        125        255
  NDD.fuTADA SCZ.schema BIP.bipex EPI.epi25
0      17188      17894     17341     13531
1        817         34        33        15
rareVar.logit %>% as_tibble()
# A tibble: 62,437 × 10
   ASD.SFARI.1 ASD.SFA…¹ ASD.S…² DDD.k…³ EPI.h…⁴ ASD.f…⁵ NDD.f…⁶ SCZ.s…⁷ BIP.b…⁸
         <dbl>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
 1          NA        NA      NA      NA      NA NA      NA           NA      NA
 2          NA        NA      NA      NA      NA NA      NA           NA      NA
 3          NA        NA      NA      NA      NA NA      NA           NA      NA
 4          NA        NA      NA      NA      NA NA      NA           NA      NA
 5          NA        NA      NA      NA      NA NA      NA           NA      NA
 6          NA        NA      NA      NA      NA NA      NA           NA      NA
 7           0         0       0       0       0  0.0190  0.0195      NA      NA
 8          NA        NA      NA      NA      NA NA      NA           NA      NA
 9          NA        NA      NA      NA      NA NA      NA           NA      NA
10          NA        NA      NA      NA      NA NA      NA           NA      NA
# … with 62,427 more rows, 1 more variable: EPI.epi25 <dbl>, and abbreviated
#   variable names ¹​ASD.SFARI.1or2, ²​ASD.SFARI.S, ³​DDD.kaplanis, ⁴​EPI.helbig,
#   ⁵​ASD.fuTADA, ⁶​NDD.fuTADA, ⁷​SCZ.schema, ⁸​BIP.bipex

Gene Feature Annotations

# Compute fetal brain gene expression
geneFeatures = cts %>% group_by(ensg=substr(annot_gene_id,1,15)) %>% 
  summarise(gene_counts = sum(counts)) %>% 
  mutate(tpm = gene_counts / (sum(gene_counts) / 1000000))

# Calculate number of transcripts and exons per gene in IsoSeq data
geneFeatures <-  as_tibble(talon_gtf)  %>% 
  mutate(ensg = str_sub(gene_id, 1, 15)) %>% group_by(ensg) %>%
  summarize(n_transcripts = n_distinct(na.omit(transcript_id)), n_exons = n_distinct(na.omit(exon_id))) %>%
  ungroup() %>% left_join(geneFeatures)
Joining, by = "ensg"
# Add gene length from Gencode
geneFeatures <- as_tibble(gencode_gtf) %>% dplyr::filter(type=="gene") %>% 
  mutate(ensg=substr(gene_id,0,15)) %>% dplyr::select(ensg, gene_length=width,gene_name) %>% distinct() %>%
  right_join(geneFeatures)
Joining, by = "ensg"
gencode_exons = gencode_gtf[gencode_gtf$type == "exon"]
gencode_exons_by_gene = split(gencode_exons, gencode_exons$gene_id)
geneLengths = enframe(
    sum(width(GenomicRanges::reduce(ranges(gencode_exons_by_gene)))),
    name = "gene_id",
    value = "coding_length"
) %>% left_join(
  enframe(
    max(end(gencode_exons_by_gene)) - min(start(gencode_exons_by_gene)) + 1,
    name = "gene_id",
    value = "talon_gene_length" # due to novel/unexpressed transcripts, talon gene length can differ from gencode gene length
  )
) %>% mutate(ensg = substr(gene_id, 1, 15), .keep = "unused") %>% distinct()
Joining, by = "gene_id"
geneFeatures <- geneFeatures %>% left_join(geneLengths)
Joining, by = "ensg"
# Add novel exons
novelExons = read.csv("data/working/novel_exons/mike_novel.csv")
geneFeatures$novelExon=0
geneFeatures$novelExon[geneFeatures$ensg %in% substr(novelExons$gene_id,1,15)]=1

# Add DTU/DTE/DGE
geneFeatures <- as_tibble(tableS3) %>% group_by(ensg) %>% summarise(DGE = any(DGE), DTU=any(DTU), DTE=any(DTE)) %>% right_join(geneFeatures)
Joining, by = "ensg"

Rare Variant Enrichment Analyses:

1) Transcript, exon number, novel exon

df_enrichments2 = data.frame()


# Interpretion of the odds ratio here:
# For every doubling of number of transcripts per gene (log2), there is 64% increase
# in odds that a gene is ASD risk gene (SFARI.1)
# 
# Dx          |   Feature           | Class   |   OR      | Z score   | P value |
# ------------| ------------------  | ------- | --------- | --------- | ---------- |
#   ASD.SFARI.1 |  n_transcripts_log2 | txAnno  | 1.6377222 | 9.9933917 | 1.629102e-23

for(i in 1:ncol(rareVar.binary)) {
  this_rareVar = rareVar.binary[geneFeatures$ensg,i]
  
  # Number of transcripts
  s=summary(glm(this_rareVar ~ log2(n_transcripts) + log10(gene_length) + log10(coding_length),
                data=geneFeatures, family='binomial'))
  coef = s$coefficients
  this_result = data.frame(dx = names(rareVar.binary)[i], feature= "n_transcripts_log2",  class='txAnno',
                           OR = exp(coef[2,1]), Z=coef[2,3], P=coef[2,4])
  df_enrichments2 = rbind(df_enrichments2, this_result)
  
    # Number of transcripts | expression
  s=summary(glm(this_rareVar ~ log2(n_transcripts) + log10(tpm) + log10(gene_length) + log10(coding_length),
                data=geneFeatures, family='binomial'))
  coef = s$coefficients
  this_result = data.frame(dx = names(rareVar.binary)[i], feature= "n_transcripts_log2|tpm",  class='txAnno',
                           OR = exp(coef[2,1]), Z=coef[2,3], P=coef[2,4])
  df_enrichments2 = rbind(df_enrichments2, this_result)
  
  # Number of exons
  s=summary(glm(this_rareVar ~ log2(n_exons) + log10(gene_length) + log10(coding_length),
                data=geneFeatures, family='binomial'))
  coef = s$coefficients
  this_result = data.frame(dx = names(rareVar.binary)[i], feature= "n_exons_log2",  class='txAnno',
                           OR = exp(coef[2,1]), Z=coef[2,3], P=coef[2,4])
  df_enrichments2 = rbind(df_enrichments2, this_result)
  
  # Gene has a novel exon
  s=summary(glm(this_rareVar ~ novelExon + log10(gene_length) + log10(coding_length),
                data=geneFeatures, family='binomial'))
  coef = s$coefficients
  this_result = data.frame(dx = names(rareVar.binary)[i], feature= "novel_exon",   class='txAnno',
                           OR = exp(coef[2,1]), Z=coef[2,3], P=coef[2,4])
  df_enrichments2 = rbind(df_enrichments2, this_result)
  
}

df_enrichments2 %>%
  mutate(fdr = p.adjust(P, 'fdr')) %>%
  ggplot(aes(
    x = -log10(fdr),
    y = reorder_within(feature, -log10(fdr), dx)
  )) +
  geom_col(fill = 'royalblue') + 
  geom_vline(xintercept = -log10(.05), lty = 2, color = 'red') +
  scale_y_reordered() +
  labs(
    y = NULL,
    title = "Rare Variant Gene Enrichments",
    subtitle = "Gene features from Iso-Seq"
  ) +
  theme_bw() +
  facet_wrap(vars(dx), scales = 'free_y', ncol=2)

ggsave("output/figures/revision1/Fig6/revisedFinal/Fig6_rareVar01.pdf", width = 6, height = 6)

2) DGE/DTE/DTU genes

diffExMat = data.frame(DTU = as.numeric(geneFeatures$DTU), 
                       DTE = as.numeric(geneFeatures$DTE),
                       DGE=as.numeric(geneFeatures$DGE))
rownames(diffExMat) = geneFeatures$ensg
diffExMat$DTU.not.DGE = as.numeric(geneFeatures$DTU & !geneFeatures$DGE)
diffExMat$DTU.not.DTE = as.numeric(geneFeatures$DTU & !geneFeatures$DTE)
diffExMat$DTE.not.DGE = as.numeric(geneFeatures$DTE & !geneFeatures$DGE)
diffExMat$DTU.up = as.numeric(rownames(diffExMat) %in% (tableS3 %>% filter(DTU, DTU_dIF>0) %>% dplyr::select(ensg) %>% pull()))
diffExMat$DTU.down = as.numeric(rownames(diffExMat) %in% (tableS3 %>% filter(DTU, DTU_dIF<0) %>% dplyr::select(ensg) %>% pull()))
diffExMat$DTE.up = as.numeric(rownames(diffExMat) %in% (tableS3 %>% filter(DTE, DTE_log2FC>0) %>% dplyr::select(ensg) %>% pull()))
diffExMat$DTE.down = as.numeric(rownames(diffExMat) %in% (tableS3 %>% filter(DTE, DTE_log2FC<0) %>% dplyr::select(ensg) %>% pull()))
diffExMat$DGE.up = as.numeric(rownames(diffExMat) %in% (tableS3 %>% filter(DGE, DGE_log2FC>0) %>% dplyr::select(ensg) %>% pull()))
diffExMat$DGE.down = as.numeric(rownames(diffExMat) %in% (tableS3 %>% filter(DGE, DGE_log2FC<0) %>% dplyr::select(ensg) %>% pull()))

df_enrichments.de2 = data.frame()
for(i in 1:ncol(rareVar.binary)) {
  this_rareVar = rareVar.binary[geneFeatures$ensg,i]
  
  for(j in 1:ncol(diffExMat)) {
    this_feature = diffExMat[geneFeatures$ensg,j]
    
      s=summary(glm(this_rareVar ~ this_feature + log10(gene_length) + log10(coding_length),
                data=geneFeatures, family='binomial'))
       coef = s$coefficients
  this_result = data.frame(dx = names(rareVar.binary)[i], class='DGE/DTE/DTU',
                           feature= colnames(diffExMat)[j],
                           OR = exp(coef[2,1]), Z=coef[2,3], P=coef[2,4])
  df_enrichments.de2 = rbind(df_enrichments.de2, this_result)
      
  }
}
df_enrichments.de2 %>%
  mutate(fdr = p.adjust(P, 'fdr')) %>% 
  ggplot(aes(x=reorder_within(feature, -log10(fdr), dx), y=-log10(fdr), size=OR, alpha=(OR>1))) +
  geom_point(color='royalblue') +
  scale_x_reordered() +
#  ylim(0, 10) +
  facet_wrap(dx~.,scales = 'free_y',ncol=2) +
  coord_flip() +
  geom_hline(yintercept = -log10(.05),lty=2, color='red') +
  labs(
    x = NULL,
    title = "Rare Variant Gene Enrichments",
    subtitle = "Gene features from Iso-Seq"
  ) +
  theme_bw()
Warning: Using alpha for a discrete variable is not advised.

ggsave("output/figures/revision1/Fig6/revisedFinal/Fig6_rareVar02.pdf", width = 7, height = 11)
Warning: Using alpha for a discrete variable is not advised.
df_enrichments.de2$feature = factor(df_enrichments.de2$feature, levels=c("DGE", "DGE.down", "DGE.up", "DTE", "DTE.down", "DTE.up", "DTU", "DTU.down", "DTU.up", "DTU.not.DGE", "DTU.not.DTE", "DTE.not.DGE"))
df_enrichments.de2 %>%
  mutate(set= str_split_i(df_enrichments.de2$feature, '[.]',1)) %>%
  mutate(fdr = p.adjust(P, 'fdr')) %>%
  ggplot(aes(x=reorder_within(dx, -log10(fdr), feature), y=-log10(fdr), fill=set)) +
  geom_bar(stat='identity') +
  scale_x_reordered() +
#  ylim(0, 10) +
  facet_wrap(feature~.,scales = 'free_y',ncol=3) +
  coord_flip() +
  geom_hline(yintercept = -log10(.05),lty=2, color='red') +
  labs(x="") +
  ggtitle("Rare Variant Gene Enrichments",subtitle = 'CP/GZ differential expression') +
  theme_bw()

ggsave("output/figures/revision1/Fig6/revisedFinal/Fig6_rareVar03.pdf", width = 9, height = 8.5)
df_enrichments.de2 %>%
  mutate(fdr=p.adjust(P,'fdr')) %>%
  ggplot(aes(y=dx, x=feature,label=ifelse(fdr<.05,signif(OR,2), ""), fill=-log10(fdr))) +
  geom_tile() +
  geom_text() +
  scale_fill_gradient(
    low='white',
    high='red',
#    limits=c(0,10)
  ) +
  scale_x_discrete(expand = c(0, 0), guide = guide_axis(angle = -90)) +
  scale_y_discrete(expand = c(0, 0)) +
  theme_bw()

ggsave("output/figures/revision1/Fig6/revisedFinal/Fig6_rareVar04.pdf", width = 6, height = 5)

3) Network Modules

kMEmat = tidyMods %>% filter(module.color != 'grey') %>% pivot_wider(id_cols = ensg, names_from = module,values_from = kME, values_fn = 'max', values_fill = 0) %>% as_data_frame()
Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
ℹ Please use `as_tibble()` instead.
ℹ The signature and semantics have changed, see `?as_tibble`.
these_genes = kMEmat$ensg
kMEmat = as.data.frame(kMEmat)[,-1]
rownames(kMEmat) = these_genes
kMEmat[kMEmat!=0] = 1


this_net_enrichment = function(i, j) {
  this_feature = scale(kMEmat[geneFeatures$ensg,j],center = F)
  this_rareVar = rareVar.binary[geneFeatures$ensg,i]
  s=summary(glm(this_rareVar ~ this_feature + log10(gene_length) + log10(coding_length),
                data=geneFeatures, family='binomial'))
  coef = s$coefficients
  this_result = data.frame(dx = names(rareVar.binary)[i], class='Networks',
                           feature= colnames(kMEmat)[j],
                           OR = exp(coef[2,1]), Z=coef[2,3], P=coef[2,4])
  return(this_result)
}



df_enrichments.net2 = data.frame()
for(i in 1:ncol(rareVar.binary)) {
  print(i)
  this_df_enrichment = do.call("rbind", mclapply(1:105,  this_net_enrichment, i=i,mc.cores = detectCores()))
  df_enrichments.net2 = rbind(df_enrichments.net2, this_df_enrichment)
  }
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10

–> Dendrogram Plot

library(pheatmap)
net = df_enrichments.net2 %>% pivot_wider(id_cols = feature, names_from = dx, values_from = P) %>% as.data.frame()
rownames(net) = net$feature
net = -log10(net[,-1])

anno = data.frame(network = strsplit2(rownames(net),"[.]")[,1])
rownames(anno) = rownames(net)

pheatmap(t(net),clustering_method = 'average', border_color = 'grey60', annotation_col = anno,color = blueWhiteRed(100)[50:100])

pheatmap(t(net), clustering_method = 'average', border_color = 'grey60',
         annotation_col = anno,color = blueWhiteRed(100)[51:100],
         file = "output/figures/revision1/Fig6/revisedFinal/Fig6_rareVar05.pdf", width = 14, height = 7,
         fontsize = 10, fontsize_col = 8)

4) Single cell markers

cell_type_order = c("vRG", "oRG", "PgS", "PgG2M", "IP", "vRG-ExN", "ExN-1", "ExN-2", "ExN-3", "ExN-ExM", "ExM", "ExM-U", "ExDp", "In")
tableS5C = openxlsx::read.xlsx('data/single_cell/Table_S5_v2.xlsx',sheet = 'C')

if(FALSE) {
  seurat = read_rds('data/single_cell/seurat/scIsoseq_4kcells_final.rds')
  seurat_background = str_split_fixed(seurat@assays$RNA@data@Dimnames[[1]],'-',n = 2)[,1]
  write_csv(data.frame(gene=seurat_background), file='output/tables/tableS5/seurat_background.csv')
}
seurat_background = read_csv('data/single_cell/seurat_background.csv')$gene
Rows: 87162 Columns: 1
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): gene

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tableS5C = rbind(tableS5C, data.frame(Gene.Name=unique(seurat_background[!seurat_background %in% tableS5C$Gene.Name]), Isoform.Id=NA, cluster=NA, avg_log2FC=NA, p_val=NA, p_val_adj=NA))
tableS5C <- tableS5C %>% left_join(geneAnno %>% dplyr::select(ensg, gene_name), by=c("Gene.Name" = "gene_name"))

df.cell_makers <- tableS5C %>% pivot_wider(id_cols = ensg, names_from = cluster, values_from = avg_log2FC,values_fill = 0,values_fn = function(x) {max(x)}) %>% filter(!is.na(ensg)) %>% as.data.frame()
rownames(df.cell_makers) = df.cell_makers$ensg
df.cell_makers = df.cell_makers[,!colnames(df.cell_makers) %in% c("ensg", "NA")]

df.cell_makers[df.cell_makers!=0]=1
df_enrichments.singlecell2 = data.frame()
for (i in 1:ncol(rareVar.binary)) {
  print(i)
  this_rareVar = rareVar.binary[geneFeatures$ensg,i]
  
  for(j in 1:ncol(df.cell_makers)) {
    this_feature = df.cell_makers[geneFeatures$ensg,j]
    
      s=summary(glm(this_rareVar ~ this_feature + log10(gene_length) + log10(coding_length),
                data=geneFeatures, family='binomial'))
       coef = s$coefficients
  this_result = data.frame(dx = names(rareVar.binary)[i], class='SingleCellMarkers',
                           feature= colnames(df.cell_makers)[j],
                           OR = exp(coef[2,1]), Z=coef[2,3], P=coef[2,4])
  df_enrichments.singlecell2 = rbind(df_enrichments.singlecell2, this_result)
      
  }
}
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
df_enrichments.singlecell2 %>% mutate(fdr = p.adjust(P, 'fdr'), feature = feature %>% as_factor() %>% fct_relevel(cell_type_order) %>% fct_rev()) %>% 
    ggplot(aes(x=feature, y=-log10(fdr)))+ geom_bar(stat='identity',fill='royalblue') + 
    ylim(0, 6) +
    facet_wrap(dx~.,scales = 'free_y',ncol=3) + coord_flip() + geom_hline(yintercept = -log10(.05),lty=2, color='red') +
    labs(x="") + ggtitle("Rare Variant Gene Enrichments",subtitle = 'Single cell markers (logit)') +
    theme_bw()

#ggsave("output/figures/revision1/Fig6/Fig6_rareVar06_for_revision.pdf", width = 7, height = 9)
# run subsequent cells to get SingleCellDTU bar
if (FALSE) {
  ggsave("output/figures/revision1/Fig6/revisedFinal/Fig6_rareVar06.pdf", width = 7, height = 9)
}

5) Single Cell DTU genes

tableS5E = openxlsx::read.xlsx('data/single_cell/Table_S5_v2.xlsx',sheet = 'E')
tableS5E <- tableS5E %>% filter(over.all < .05) %>% dplyr::select(gene_name=Gene)
tableS5E$marker = 1

tableS5E = rbind(tableS5E, data.frame(gene_name=unique(seurat_background[!seurat_background %in% tableS5E$gene_name]), marker = 0))
tableS5E <- tableS5E %>% left_join(geneAnno %>% dplyr::select(ensg, gene_name))
Joining, by = "gene_name"
tableS5E = as.data.frame(tableS5E[match(na.omit(unique(tableS5E$ensg)), tableS5E$ensg),-1])

rownames(tableS5E) = tableS5E$ensg

this_feature = tableS5E[geneFeatures$ensg,"marker"]


for (i in 1:ncol(rareVar.binary)) {
  print(i)
  this_rareVar = rareVar.binary[geneFeatures$ensg,i]
  

      s=summary(glm(this_rareVar ~ this_feature + log10(gene_length) + log10(coding_length),
                data=geneFeatures, family='binomial'))
       coef = s$coefficients
  this_result = data.frame(dx = names(rareVar.binary)[i], class='SingleCellMarkers',
                           feature= "SingleCellDTU",
                           OR = exp(coef[2,1]), Z=coef[2,3], P=coef[2,4])
  df_enrichments.singlecell2 = rbind(df_enrichments.singlecell2, this_result)
      
}
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
df_enrichments.singlecell2 %>% filter(feature=="SingleCellDTU") %>% mutate(fdr = p.adjust(P, 'fdr')) %>% group_by(dx)   %>% 
    ggplot(aes(x=reorder(dx, -log10(fdr)), y=-log10(fdr)))+ geom_bar(stat='identity',fill='royalblue') + 
#    ylim(0, 3) +
    coord_flip() + geom_hline(yintercept = -log10(.05),lty=2, color='red') +
    labs(x="") + ggtitle("Rare Variant Gene Enrichments",subtitle = 'Single cell DTU genes (logit)') + theme_bw()

ggsave("output/figures/revision1/Fig6/revisedFinal/Fig6_rareVar07.pdf", width = 4, height = 4)

Combine All

tableS6v2 = rbind(df_enrichments2, df_enrichments.de2, df_enrichments.net2, df_enrichments.singlecell2)
tableS6v2$fdr = p.adjust(tableS6v2$P, 'fdr')
tableS6v2 <- tableS6v2 %>% arrange(fdr)
idx = which(tableS6v2$class=="Networks")
tableS6v2$class[idx] = paste0("Networks.", str_split_fixed(tableS6v2$feature[idx], '[.]',n = 2)[,1])
tableS6v2 %>%
  filter(OR > 1) %>%
  arrange(dx, fdr, desc(OR)) %>%
  group_by(dx) %>%
  slice_head(n = 10) %>%
#  filter(row_number() <= 10 | fdr < .05) %>% # ASD.fuTADA, DDD.kaplanis, NDD.fuTADA have more than 10 hits
  ggplot(aes(
    x = -log10(fdr),
    y = reorder_within(feature, -log10(fdr), dx),
    fill = class
  )) +
  geom_col() +
  geom_vline(xintercept = -log10(.05), lty=2, color='red') +
  scale_y_reordered() +
  labs(
    y = NULL,
    title = "Rare Variant Enrichments",
    subtitle = "Top 10 categories per rare variant dx"
  ) +
  theme_bw() +
  facet_wrap(vars(dx), scales = 'free_y', ncol = 2)

ggsave("output/figures/revision1/Fig6/revisedFinal/Fig6_rareVar08.pdf", width = 8.5, height = 11)
tableS6v2 %>%
  filter(OR > 1, fdr < .4) %>%
  group_by(dx) %>%
  slice_min(order_by = fdr, n = 10) %>%
  ggplot(aes(x=reorder_within(feature, -log10(fdr), class), y=-log10(fdr),fill=dx)) +
  geom_col(position = position_dodge()) +
  scale_x_reordered() +
  facet_wrap(class~.,scales = 'free',ncol=2) +
  coord_flip() +
  geom_hline(yintercept = -log10(.05),lty=2, color='red') +
  labs(x="") +
  ggtitle("Rare Variant Enrichments")

ggsave("output/figures/revision1/Fig6/revisedFinal/Fig6_rareVar09.pdf", width = 9, height = 8.5)
tableS6v2 %>%
  filter(OR > 1, fdr < .1, dx=="NDD.fuTADA") %>% 
  ggplot(aes(x=reorder(feature, -log10(fdr)), y=-log10(fdr),fill=class)) +
  geom_bar(stat='identity',) +
  coord_flip() +
  geom_hline(yintercept = -log10(.05),lty=2, color='red') +
  ggtitle("NDD_fuTADA") +
  theme_bw() +
  labs(x="")

ggsave("output/figures/revision1/Fig6/revisedFinal/Fig6_rareVar10.pdf", width = 8.5, height = 11)
tableS6v2 %>%
  filter(OR > 1, fdr < .1, dx=="ASD.fuTADA") %>% 
  ggplot(aes(x=reorder(feature, -log10(fdr)), y=-log10(fdr),fill=class)) +
  geom_bar(stat='identity',) +
  coord_flip() +
  geom_hline(yintercept = -log10(.05),lty=2, color='red') +
  ggtitle("ASD_fuTADA") +
  theme_bw() +
  labs(x="")

ggsave("output/figures/revision1/Fig6/revisedFinal/Fig6_rareVar11.pdf", width = 7, height = 6)
tableS6v2 %>%
  filter(OR > 1, fdr < .1, dx=="DDD.kaplanis") %>% 
  ggplot(aes(x=reorder(feature, -log10(fdr)), y=-log10(fdr),fill=class)) +
  geom_bar(stat='identity',) +
  coord_flip() +
  geom_hline(yintercept = -log10(.05),lty=2, color='red') +
  ggtitle("DDD_kaplanis") +
  theme_bw() +
  labs(x="")

ggsave("output/figures/revision1/Fig6/revisedFinal/Fig6_rareVar12.pdf", width = 7, height = 6)
write_tsv(tableS6v2,file="output/figures/revision1/Fig6/revisedFinal/tableS6v2_rareVariants.tsv")
tableS6v2 %>%
  filter(OR > 1) %>%
  group_by(class) %>%
  slice_min(order_by = fdr,n = 10)  %>% 
  ggplot(aes(x=reorder_within(feature, -log10(fdr), class), y=-log10(fdr),fill=dx)) +
  geom_bar(stat='identity',position=position_dodge()) +
  facet_wrap(class~.,scales = 'free_y',ncol=2) +
  scale_x_reordered() +
  coord_flip() +
  geom_hline(yintercept = -log10(.05),lty=2, color='red') +
  labs(x="") +
  ggtitle("Rare Variant Enrichments", subtitle = "Net")

ggsave("output/figures/revision1/Fig6/revisedFinal/Fig6_rareVar13.pdf", width = 9, height = 8.5)