Figure 1 - BulkTxomeAnalysis REVISION1

Author

Michael Gandal

suppressPackageStartupMessages({
  library(IsoformSwitchAnalyzeR)
  library(rtracklayer)
  library(ggrepel)
  library(scales)
  library(GenomicFeatures)
  library(DescTools)
  library(tidyverse)
  library(magrittr)
})

colorVector = c(
  "Known" = "#009E73",
  "ISM"   = "#0072B2",
  "ISM_Prefix" = "#005996",
  "ISM_Suffix" = "#378bcc",
#  "ISM_Both"   = "#0072B2",
  "NIC"   = "#D55E00",
  "NNC"   = "#E69F00",
  "Other" = "#000000"
)
colorVector_ismSplit = colorVector[-2]

Load Data

if(!file.exists("data/working/bulkTxome.Rdata")) {
  talon_gtf = rtracklayer::import("data/cp_vz_0.75_min_7_recovery_talon.gtf.gz")
  tx.isoseq =  talon_gtf %>% as_tibble() %>% filter(type == "transcript") 
  
  sqanti_gtf = rtracklayer::import("data/sqanti/cp_vz_0.75_min_7_recovery_talon_corrected.gtf.cds.gtf.gz")
  tx.sqanti = sqanti_gtf %>% as_tibble() %>% filter(type == "transcript")
  
  gencode_gtf = rtracklayer::import("ref/gencode.v33lift37.annotation.gtf.gz") 
  tx.gencode =  gencode_gtf %>% as_tibble() %>% filter(type == "transcript")
  
  txdb.gencode = makeTxDbFromGRanges(gencode_gtf)
  gencodelengths= transcriptLengths(txdb.gencode)
  
  txdb.isoseq = makeTxDbFromGRanges(talon_gtf)
  isoSeqLengths = transcriptLengths(txdb.isoseq)
  samps = tribble( 
    ~sample_id, ~condition,
    "VZ_209", "VZ",
    "VZ_334", "VZ",
    "VZ_336", "VZ",
    "CP_209", "CP",
    "CP_334", "CP",
    "CP_336", "CP"
  ) %>%
    dplyr::mutate(
      dplyr::across(condition, as_factor)
    )
  
  cts = read_table("data/cp_vz_0.75_min_7_recovery_talon_abundance_filtered.tsv.gz")
  cts.collapse = cts %>%
    mutate(
      VZ_209 = rowSums(across(matches("209_.*_VZ"))),
      VZ_334 = rowSums(across(matches("334_.*_VZ"))),
      VZ_336 = rowSums(across(matches("336_.*_VZ"))),
      CP_209 = rowSums(across(matches("209_.*_CP"))),
      CP_334 = rowSums(across(matches("334_.*_CP"))),
      CP_336 = rowSums(across(matches("336_.*_CP"))),
      .keep = "unused"
    ) %>%
    dplyr::select(!c("gene_ID", "transcript_ID", "annot_transcript_name")) %>%
    dplyr::rename(
      gene_id = "annot_gene_id",
      transcript_id = "annot_transcript_id",
      gene_name = "annot_gene_name"
    ) %>%
    mutate(
      gene_novelty = as.factor(gene_novelty),
      transcript_novelty = as.factor(transcript_novelty),
      ISM_subtype = ISM_subtype %>% na_if("None") %>% as.factor()
    )
  cts$counts = rowSums(as.matrix(cts.collapse[,9:14]))
  
  cts$novelty2 = as.character(cts$transcript_novelty)
  cts$novelty2[which(cts$novelty2=="ISM" & cts$ISM_subtype=="Prefix")] = "ISM_Prefix"
  cts$novelty2[which(cts$novelty2=="ISM" & cts$ISM_subtype=="Suffix")] = "ISM_Suffix"
  cts$novelty2[cts$novelty2 %in% c("Antisense", "Genomic", "Intergenic", "ISM")] = "Other"
  cts$novelty2 = factor(cts$novelty2,levels=c("Known", "ISM_Prefix", "ISM_Suffix", "NIC", "NNC", "Other"))
  
  
  TableS1 = tx.isoseq %>% dplyr::select(gene_id, transcript_id, gene_name, transcript_name, seqnames, start, end, strand, transcript_length=width, source, gene_status, gene_type, transcript_status,transcript_type,  havana_transcript, ccdsid, protein_id)
  TableS1 = TableS1 %>% left_join(cts %>% dplyr::select(transcript_id=annot_transcript_id, transcript_novelty, ISM_subtype, transcript_novelty2 = novelty2, n_exons, cds_length = length, expression_counts = counts))
  TableS1$expression_TPM = TableS1$expression_counts / (sum(TableS1$expression_counts / 1000000))
  write_tsv(TableS1, file="output/tables/TableS1_transcript_annotation.tsv")
  save.image("data/working/bulkTxome.Rdata")
} else {
  load("data/working/bulkTxome.Rdata")
}

Analyses of transcripts per gene & disease

NDD risk genes ~ unique transcipts per gene

source("code/risk_genes.R")
Warning: One or more parsing issues, see `problems()` for details
Rows: 18321 Columns: 26
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr  (8): gene_id, group, OR (PTV), OR (Class I), OR (Class II), OR (PTV) up...
dbl (16): Case PTV, Ctrl PTV, Case mis3, Ctrl mis3, Case mis2, Ctrl mis2, P ...
lgl  (2): De novo mis3, De novo mis2

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 119958 Columns: 20
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr  (4): gene_id, group, damaging_missense_fisher_gnom_non_psych_OR, ptv_fi...
dbl (16): n_cases, n_controls, damaging_missense_case_count, damaging_missen...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 71456 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): gene_id, group
dbl (9): xcase_lof, xctrl_lof, pval_lof, xcase_mpc, xctrl_mpc, pval_mpc, xca...
lgl (1): pval_infrIndel

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
risk_genes = read.csv("ref/ASD+SCZ+DDD_2022.csv")
pLI_scores = read.table('ref/pLI_scores.ensid.txt',header = T)
asd_genes = risk_genes$Gene[risk_genes$Set=="ASD (SFARI score 1)"]
ddd_genes = risk_genes$Gene[risk_genes$Set=="DDD (Kaplanis et al. 2019)"]

geneCounts = cts %>% group_by(gene_id=substr(annot_gene_id,1,15)) %>% summarise(gene_count = sum(counts))
geneCounts$gene_count = geneCounts$gene_count / (sum(geneCounts$gene_count) / 1000000)

talon_exons = talon_gtf[talon_gtf$type == "exon"]
#talon_exons_novel = talon_gtf[talon_gtf$type == "exon" & talon_gtf$transcript_status == "NOVEL"]
talon_exons_by_gene = split(talon_exons, talon_exons$gene_id)
#talon_exons_by_gene_novel = split(talon_exons_novel, talon_exons_novel$gene_id)
geneLengths = enframe(
  sum(width(GenomicRanges::reduce(ranges(talon_exons_by_gene)))),
  name = "gene_id",
  value = "coding_length"
) %>%
  left_join(
    enframe(
      max(end(talon_exons_by_gene)) - min(start(talon_exons_by_gene)) + 1,
      name = "gene_id",
      value = "talon_width" # due to novel/unexpressed transcripts, talon gene width can differ from gencode gene width
    )
  ) %>%
#  left_join(
#    enframe(
#      sum(width(GenomicRanges::reduce(ranges(talon_exons_by_gene_novel)))),
#      name = "gene_id",
#      value = "coding_length_novel"
#    )
#  ) %>%
  mutate(gene_id = substr(gene_id, 1, 15))
Joining, by = "gene_id"
df <- talon_gtf %>% as_tibble()  %>% 
  mutate(gene_id = str_sub(gene_id, 1, 15)) %>%
  group_by(gene_id) %>%
  summarize(n_transcripts = n_distinct(na.omit(transcript_id)), n_exons = n_distinct(na.omit(exon_id))) %>%
  ungroup() 

df <- as_tibble(gencode_gtf) %>% dplyr::filter(type=="gene") %>% mutate(gene_id=substr(gene_id,0,15)) %>% right_join(df, by="gene_id")
df <- df %>% left_join(geneCounts)
Joining, by = "gene_id"
df <- df %>% left_join(geneLengths)
Joining, by = "gene_id"
df <- pLI_scores %>% as_tibble() %>% dplyr::select(gene_id=gene, pLI) %>% right_join(df)
Joining, by = "gene_id"
df$gene_rank = rank(-df$n_transcripts, ties.method = 'first')

df$DDD = FALSE
df$DDD[df$gene_name %in% c(asd_genes, ddd_genes)] = TRUE

df = rareVar.binary %>% as_tibble(rownames = "gene_id") %>% right_join(df)
Joining, by = "gene_id"
s=summary(glm(NDD.fuTADA ~ log10(n_transcripts)  + log10(width)  + log10(gene_count) + log10(coding_length), data=df %>% filter(gene_type == "protein_coding"), family='binomial'))
print(s)

Call:
glm(formula = NDD.fuTADA ~ log10(n_transcripts) + log10(width) + 
    log10(gene_count) + log10(coding_length), family = "binomial", 
    data = df %>% filter(gene_type == "protein_coding"))

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1142  -0.3808  -0.2840  -0.1897   3.5966  

Coefficients:
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)          -11.13478    0.61994 -17.961  < 2e-16 ***
log10(n_transcripts)   0.44281    0.15209   2.912  0.00360 ** 
log10(width)           0.47907    0.08761   5.468 4.54e-08 ***
log10(gene_count)      0.21934    0.07888   2.781  0.00542 ** 
log10(coding_length)   1.39117    0.20486   6.791 1.11e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 6026.3  on 13953  degrees of freedom
Residual deviance: 5515.8  on 13949  degrees of freedom
  (1193 observations deleted due to missingness)
AIC: 5525.8

Number of Fisher Scoring iterations: 6
exp(s$coefficients[,1])
         (Intercept) log10(n_transcripts)         log10(width) 
        1.459579e-05         1.557078e+00         1.614573e+00 
   log10(gene_count) log10(coding_length) 
        1.245254e+00         4.019550e+00 
Fig1H = df %>% mutate(NDD.fuTADA = NDD.fuTADA %>% as.logical() %>% replace_na(F)) %>%
  ggplot(aes(x = gene_rank, y = n_transcripts, color=NDD.fuTADA)) +
  geom_point() + geom_line(color='blue') + 
  geom_label_repel(data = . %>% filter(n_transcripts > 150 | (n_transcripts > 80 & NDD.fuTADA==TRUE)),aes(label = gene_name),force = 30, direction='both',nudge_y=-.1,nudge_x = .3, max.iter = 10000,max.overlaps = 50, size=2.5) + scale_color_manual(values=c("TRUE" = "red", "FALSE" = "black")) + scale_y_log10() + scale_x_log10() + theme_bw() + annotation_logticks() + theme(legend.position = 'none') + labs(x="Gene rank", y="# Transcripts") + ggtitle("NDD risk genes ~ unique transcripts per gene",subtitle=paste0("OR ",signif(exp(s$coefficients[2,1]),3),", P=", signif(s$coefficients[2,4],2)))
Fig1H

ggsave(file="output/figures/revision1/Fig1K_codingLen_NDD.fuTADA.pdf",Fig1H, width = 8, height=3)

n_exons and n_transcripts are closely correlated:

summary(glm(
  DDD ~ log10(width)  + log10(gene_count) + log10(coding_length) + log10(n_exons),
  data=df %>% filter(gene_type == "protein_coding"),
  family='binomial'
))

Call:
glm(formula = DDD ~ log10(width) + log10(gene_count) + log10(coding_length) + 
    log10(n_exons), family = "binomial", data = df %>% filter(gene_type == 
    "protein_coding"))

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.9874  -0.2504  -0.1683  -0.0992   3.6740  

Coefficients:
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)          -13.93640    0.85248 -16.348  < 2e-16 ***
log10(width)           0.65078    0.12899   5.045 4.53e-07 ***
log10(gene_count)      0.34755    0.09001   3.861 0.000113 ***
log10(coding_length)   1.25954    0.30160   4.176 2.96e-05 ***
log10(n_exons)         1.33657    0.27354   4.886 1.03e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3522.0  on 15146  degrees of freedom
Residual deviance: 3099.3  on 15142  degrees of freedom
AIC: 3109.3

Number of Fisher Scoring iterations: 7
exp(1.33657)
[1] 3.805967
summary(glm(
  DDD ~ log10(n_transcripts)  + log10(width)  + log10(gene_count) + log10(coding_length),
  data=df %>% filter(gene_type == "protein_coding"),
  family='binomial'
))

Call:
glm(formula = DDD ~ log10(n_transcripts) + log10(width) + log10(gene_count) + 
    log10(coding_length), family = "binomial", data = df %>% 
    filter(gene_type == "protein_coding"))

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.0179  -0.2493  -0.1694  -0.1034   3.6147  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)          -14.0374     0.8930 -15.720  < 2e-16 ***
log10(n_transcripts)   0.6113     0.2189   2.792  0.00523 ** 
log10(width)           0.6958     0.1267   5.493 3.96e-08 ***
log10(gene_count)      0.3081     0.1164   2.646  0.00813 ** 
log10(coding_length)   1.5574     0.2946   5.286 1.25e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3522  on 15146  degrees of freedom
Residual deviance: 3117  on 15142  degrees of freedom
AIC: 3127

Number of Fisher Scoring iterations: 7
exp(0.6113)
[1] 1.842826
mycor = df %>% transmute(
  n_tx_log10 = log10(n_transcripts),
  width_log10 = log10(talon_width),
  gene_count_log10 = log10(gene_count),
  coding_length_log10 = log10(coding_length), 
  n_exons_log10 = log10(n_exons)
) %>% cor(method = "spearman")
mycor
                    n_tx_log10 width_log10 gene_count_log10 coding_length_log10
n_tx_log10           1.0000000   0.6774404        0.8713938           0.8105970
width_log10          0.6774404   1.0000000        0.5917691           0.8174833
gene_count_log10     0.8713938   0.5917691        1.0000000           0.7530141
coding_length_log10  0.8105970   0.8174833        0.7530141           1.0000000
n_exons_log10        0.9222081   0.7889066        0.7849993           0.8350145
                    n_exons_log10
n_tx_log10              0.9222081
width_log10             0.7889066
gene_count_log10        0.7849993
coding_length_log10     0.8350145
n_exons_log10           1.0000000
myorder = colnames(mycor)
mycor %>%
  as_tibble(rownames = "var1") %>%
  pivot_longer(cols = !var1, names_to = "var2", values_to = "spearman") %>%
  mutate(across(c(var1, var2), factor, levels = myorder)) %>%
  ggplot(aes(x=var1, y=var2, fill=spearman)) +
  scale_fill_gradient(low = "white", high = "red") +
  geom_tile() +
  geom_text(aes(label = scales::number(spearman, accuracy = 0.01))) +
  scale_x_discrete(guide = guide_axis(angle = 45)) +
  scale_y_discrete(limits = rev)

ggsave(file="output/figures/revision1/Fig1K_correlations.pdf", width = 5, height=4)

FigS3: NDD risk genes ~ unique NOVEL transcipts per gene

df.novel <- talon_gtf %>% as_tibble()  %>% filter(type=="transcript", transcript_id %in% cts$annot_transcript_id[cts$novelty2!="Known"]) %>% 
  mutate(gene_id = str_sub(gene_id, 1, 15)) %>%
  group_by(gene_id) %>%
  summarize(n_transcripts = n_distinct(na.omit(transcript_id)), n_exons = n_distinct(na.omit(exon_id))) %>%
  ungroup() 

df.novel <- as_tibble(gencode_gtf) %>% dplyr::filter(type=="gene") %>% mutate(gene_id=substr(gene_id,0,15)) %>% right_join(df.novel, by="gene_id")
df.novel <- df.novel %>% left_join(geneCounts)
Joining, by = "gene_id"
df.novel <- df.novel %>% left_join(geneLengths)
Joining, by = "gene_id"
df.novel$gene_rank = rank(-df.novel$n_transcripts, ties.method = 'first')
df.novel$DDD = FALSE
df.novel$DDD[df.novel$gene_name %in% c(asd_genes, ddd_genes)] = TRUE

df.novel = rareVar.binary %>% as_tibble(rownames = "gene_id") %>% right_join(df.novel)
Joining, by = "gene_id"
s=summary(glm(NDD.fuTADA ~ log10(n_transcripts)  + log10(width)  + log10(gene_count) + log10(coding_length), data=df.novel %>% filter(gene_type == "protein_coding"), family='binomial'))
print(s)

Call:
glm(formula = NDD.fuTADA ~ log10(n_transcripts) + log10(width) + 
    log10(gene_count) + log10(coding_length), family = "binomial", 
    data = df.novel %>% filter(gene_type == "protein_coding"))

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1343  -0.4124  -0.3188  -0.2344   3.1335  

Coefficients:
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)          -11.16258    0.70920 -15.740  < 2e-16 ***
log10(n_transcripts)   0.37199    0.12866   2.891 0.003837 ** 
log10(width)           0.53523    0.09565   5.596 2.20e-08 ***
log10(gene_count)      0.34925    0.08999   3.881 0.000104 ***
log10(coding_length)   1.30727    0.22199   5.889 3.89e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5174.0  on 10453  degrees of freedom
Residual deviance: 4808.2  on 10449  degrees of freedom
  (751 observations deleted due to missingness)
AIC: 4818.2

Number of Fisher Scoring iterations: 6
sort(exp(s$coefficients[,1]))
         (Intercept)    log10(gene_count) log10(n_transcripts) 
        1.419559e-05         1.418007e+00         1.450619e+00 
        log10(width) log10(coding_length) 
        1.707839e+00         3.696057e+00 
FigS3 = df.novel %>% mutate(NDD.fuTADA = NDD.fuTADA %>% as.logical() %>% replace_na(F)) %>%
  ggplot(aes(x = gene_rank, y = n_transcripts, color=NDD.fuTADA)) +
  geom_point() + geom_line(color='blue') + 
  geom_label_repel(data = . %>% filter(n_transcripts > 150 | (n_transcripts > 75 & NDD.fuTADA==TRUE)),aes(label = gene_name),force = 30, direction='both',nudge_y=-.1,nudge_x = .3, max.iter = 10000,max.overlaps = 50, size=2.5) + scale_color_manual(values=c("TRUE" = "red", "FALSE" = "black")) + scale_y_log10() + scale_x_log10() + theme_bw() + annotation_logticks() + theme(legend.position = 'none') + labs(x="Gene rank", y="# Transcripts") + ggtitle("NDD risk genes ~ unique novel transcripts per gene",subtitle=paste0("OR ",signif(exp(s$coefficients[2,1]),3),", P=", signif(s$coefficients[2,4],2)))
FigS3

ggsave(file="output/figures/revision1/FigS3G_codingLen_NDD.fuTADA_6in.pdf",FigS3, width = 6, height=3)