Figure 1 - Table S1

Author

Connor Jops

library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.0      ✔ stringr 1.5.0 
✔ readr   2.1.2      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

library(rtracklayer)

Loading required package: GenomicRanges
Loading required package: stats4
Loading required package: BiocGenerics

Attaching package: 'BiocGenerics'

The following objects are masked from 'package:dplyr':

    combine, intersect, setdiff, union

The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs

The following objects are masked from 'package:base':

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which.max, which.min

Loading required package: S4Vectors

Attaching package: 'S4Vectors'

The following objects are masked from 'package:dplyr':

    first, rename

The following object is masked from 'package:tidyr':

    expand

The following objects are masked from 'package:base':

    expand.grid, I, unname

Loading required package: IRanges

Attaching package: 'IRanges'

The following objects are masked from 'package:dplyr':

    collapse, desc, slice

The following object is masked from 'package:purrr':

    reduce

Loading required package: GenomeInfoDb

theme_set(theme_bw())
theme_update(
  plot.title = element_text(size = rel(1.4), hjust = 0.5),
  axis.title = element_text(size = rel(1.2)),
  axis.text = element_text(color="black", size = rel(1)),
  legend.title = element_text(size = rel(1.2)),
  legend.text = element_text(color="black", size = rel(1)),
  strip.text = element_text(color="black", size = rel(1))
)

cts = read_table("data/cp_vz_0.75_min_7_recovery_talon_abundance_filtered.tsv.gz")


── Column specification ────────────────────────────────────────────────────────
cols(
  .default = col_double(),
  annot_gene_id = col_character(),
  annot_transcript_id = col_character(),
  annot_gene_name = col_character(),
  annot_transcript_name = col_character(),
  gene_novelty = col_character(),
  transcript_novelty = col_character(),
  ISM_subtype = col_character()
)
ℹ Use `spec()` for the full column specifications.

cts = cts %>%
  mutate(
    VZ_209 = rowSums(across(matches("209_.*_VZ"))),
    VZ_334 = rowSums(across(matches("334_.*_VZ"))),
    VZ_336 = rowSums(across(matches("336_.*_VZ"))),
    CP_209 = rowSums(across(matches("209_.*_CP"))),
    CP_334 = rowSums(across(matches("334_.*_CP"))),
    CP_336 = rowSums(across(matches("336_.*_CP"))),
    .keep = "unused"
  ) %>%
  select(!c("gene_ID", "transcript_ID", "annot_transcript_name")) %>%
  dplyr::rename(
    gene_id = "annot_gene_id",
    transcript_id = "annot_transcript_id",
    gene_name = "annot_gene_name"
  ) %>%
  mutate(
    gene_novelty = as.factor(gene_novelty) %>% fct_infreq() %>% fct_relevel("Known"),
    transcript_novelty = as.factor(transcript_novelty)  %>% fct_infreq(),
    ISM_subtype = ISM_subtype %>% na_if("None") %>% factor(levels = c("Prefix", "Suffix", "Both"))
  )
cts

# A tibble: 214,516 × 14
   gene_id  trans…¹ gene_…² n_exons length gene_…³ trans…⁴ ISM_s…⁵ VZ_209 VZ_334
   <chr>    <chr>   <chr>     <dbl>  <dbl> <fct>   <fct>   <fct>    <dbl>  <dbl>
 1 ENSG000… ENST00… AL6273…       1    755 Known   Known   <NA>         0      0
 2 ENSG000… ENST00… AP0062…       4   2257 Known   Known   <NA>         0      0
 3 ENSG000… ENST00… RP4-66…       1    180 Known   Known   <NA>         1      1
 4 ENSG000… ENST00… MTND2P…       1   1044 Known   Known   <NA>         0      1
 5 ENSG000… ENST00… MTCO1P…       1   1543 Known   Known   <NA>         0      0
 6 ENSG000… ENST00… LINC01…       3   1869 Known   Known   <NA>         0      2
 7 ENSG000… ENST00… LINC01…       2    566 Known   Known   <NA>         0      2
 8 ENSG000… ENST00… LINC01…       1   1873 Known   Known   <NA>         1      0
 9 ENSG000… ENST00… AL6698…       1    114 Known   Known   <NA>         3      0
10 ENSG000… ENST00… LINC01…       5   6616 Known   Known   <NA>         1      0
# … with 214,506 more rows, 4 more variables: VZ_336 <dbl>, CP_209 <dbl>,
#   CP_334 <dbl>, CP_336 <dbl>, and abbreviated variable names ¹transcript_id,
#   ²gene_name, ³gene_novelty, ⁴transcript_novelty, ⁵ISM_subtype

talon_gtf = rtracklayer::import("data/cp_vz_0.75_min_7_recovery_talon.gtf.gz")

talon_gtf = talon_gtf %>% as_tibble() %>% filter(type == "transcript")

sqanti = read_tsv("data/sqanti/cp_vz_0.75_min_7_recovery_talon_classification.txt.gz")

Rows: 214516 Columns: 48
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (14): isoform, chrom, strand, structural_category, associated_gene, asso...
dbl (21): length, exons, ref_length, ref_exons, diff_to_TSS, diff_to_TTS, di...
lgl (13): RTS_stage, FL, n_indels, n_indels_junc, bite, iso_exp, gene_exp, r...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

External support used for TALON whitelist creation (some is from our initial unfiltered run of SQANTI and some is from TALON scripts)

(there might be something wrong with this file - contains NAs)

whitelist_support = read_tsv("data/cp_vz_unfiltered_external_support.tsv.gz")

Rows: 214516 Columns: 22
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr  (7): annot_transcript_id, annot_gene_id, annot_gene_name, gene_novelty,...
dbl  (5): n_reads, n_donors, dist_to_CAGE_peak, dist_to_polyA_site, dist_to_...
lgl (10): within_CAGE_peak, within_polyA_site, polyA_motif_found, CAGE_suppo...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

whitelist_support

# A tibble: 214,516 × 22
   annot_trans…¹ annot…² annot…³ gene_…⁴ trans…⁵ ISM_s…⁶ n_reads n_don…⁷ dist_…⁸
   <chr>         <chr>   <chr>   <chr>   <chr>   <chr>     <dbl>   <dbl>   <dbl>
 1 ENST00000000… ENSG00… ARF5    Known   Known   <NA>        825       3       6
 2 ENST00000000… ENSG00… M6PR    Known   Known   <NA>       1955       3      19
 3 ENST00000000… ENSG00… ESRRA   Known   Known   <NA>         11       3       2
 4 ENST00000001… ENSG00… FKBP4   Known   Known   <NA>       2319       3     -30
 5 ENST00000001… ENSG00… CYP26B1 Known   Known   <NA>         63       3    6497
 6 ENST00000002… ENSG00… NDUFAF7 Known   Known   <NA>        158       3     -19
 7 ENST00000002… ENSG00… FUCA2   Known   Known   <NA>        663       3     -10
 8 ENST00000002… ENSG00… DBNDD1  Known   Known   <NA>        421       3       0
 9 ENST00000002… ENSG00… HS3ST1  Known   Known   <NA>       1351       3      -8
10 ENST00000003… ENSG00… CYP51A1 Known   Known   <NA>       6691       3     -22
# … with 214,506 more rows, 13 more variables: within_CAGE_peak <lgl>,
#   dist_to_polyA_site <dbl>, within_polyA_site <lgl>, polyA_motif <chr>,
#   dist_to_polyA_motif <dbl>, polyA_motif_found <lgl>, CAGE_support_100 <lgl>,
#   CAGE_support_250 <lgl>, CAGE_support_500 <lgl>, PAS_motif_support_35 <lgl>,
#   PAS_motif_support_50 <lgl>, PAS_motif_support_100 <lgl>,
#   long_read_db <lgl>, and abbreviated variable names ¹annot_transcript_id,
#   ²annot_gene_id, ³annot_gene_name, ⁴gene_novelty, ⁵transcript_novelty, …

sanity_check = whitelist_support %>% left_join(sqanti, by = c("annot_transcript_id" = "isoform"))
sanity_check %>% filter(within_CAGE_peak.x != within_CAGE_peak.y)

# A tibble: 0 × 69
# … with 69 variables: annot_transcript_id <chr>, annot_gene_id <chr>,
#   annot_gene_name <chr>, gene_novelty <chr>, transcript_novelty <chr>,
#   ISM_subtype <chr>, n_reads <dbl>, n_donors <dbl>,
#   dist_to_CAGE_peak.x <dbl>, within_CAGE_peak.x <lgl>,
#   dist_to_polyA_site.x <dbl>, within_polyA_site.x <lgl>, polyA_motif.x <chr>,
#   dist_to_polyA_motif <dbl>, polyA_motif_found.x <lgl>,
#   CAGE_support_100 <lgl>, CAGE_support_250 <lgl>, CAGE_support_500 <lgl>, …

sanity_check %>% filter(is.na(within_CAGE_peak.x) & !within_CAGE_peak.y)

# A tibble: 3,163 × 69
   annot_trans…¹ annot…² annot…³ gene_…⁴ trans…⁵ ISM_s…⁶ n_reads n_don…⁷ dist_…⁸
   <chr>         <chr>   <chr>   <chr>   <chr>   <chr>     <dbl>   <dbl>   <dbl>
 1 ENST00000070… ENSG00… PKP2    Known   Known   <NA>         NA      NA      NA
 2 ENST00000218… ENSG00… STAG2   Known   Known   <NA>         NA      NA      NA
 3 ENST00000238… ENSG00… ZC2HC1C Known   Known   <NA>         NA      NA      NA
 4 ENST00000245… ENSG00… RPL23   Known   Known   <NA>         NA      NA      NA
 5 ENST00000252… ENSG00… FUT5    Known   Known   <NA>         NA      NA      NA
 6 ENST00000254… ENSG00… UBE2L2  Known   Known   <NA>         NA      NA      NA
 7 ENST00000254… ENSG00… ZSWIM4  Known   Known   <NA>         NA      NA      NA
 8 ENST00000254… ENSG00… AL3919… Known   Known   <NA>         NA      NA      NA
 9 ENST00000261… ENSG00… FOXN3   Known   Known   <NA>         NA      NA      NA
10 ENST00000262… ENSG00… IKZF4   Known   Known   <NA>         NA      NA      NA
# … with 3,153 more rows, 60 more variables: within_CAGE_peak.x <lgl>,
#   dist_to_polyA_site.x <dbl>, within_polyA_site.x <lgl>, polyA_motif.x <chr>,
#   dist_to_polyA_motif <dbl>, polyA_motif_found.x <lgl>,
#   CAGE_support_100 <lgl>, CAGE_support_250 <lgl>, CAGE_support_500 <lgl>,
#   PAS_motif_support_35 <lgl>, PAS_motif_support_50 <lgl>,
#   PAS_motif_support_100 <lgl>, long_read_db <lgl>, chrom <chr>, strand <chr>,
#   length <dbl>, exons <dbl>, structural_category <chr>, …

support_for_plot = cts %>% select(transcript_id, transcript_novelty) %>%
  left_join(
    read_csv("data/within_CAGE/cpvz_refTSS_200_CAGE_results.csv") %>%
      mutate(CAGE_support = CAGE_support == "yes") %>%
      dplyr::rename(within_CAGE_refTSS = "CAGE_support"),
    by = c("transcript_id" = "transcript_ID")
  ) %>%
  left_join(
    read_csv("data/within_CAGE/cpvz_fetal_200_CAGE_results.csv") %>%
      mutate(CAGE_support = CAGE_support == "yes") %>%
      dplyr::rename(within_CAGE_fetal = "CAGE_support"),
    by = c("transcript_id" = "transcript_ID")
  ) %>%
  left_join(
    read_csv("data/within_ATAC/cpvz_Greenleaf_500_ATAC_results.csv") %>%
      mutate(CAGE_support = CAGE_support == "yes") %>%
      dplyr::rename(within_ATAC_Greenleaf = "CAGE_support"),
    by = c("transcript_id" = "transcript_ID")
  ) %>%
  left_join(
    read_csv("data/within_ATAC/cpvz_Nowakowski_500_ATAC_results.csv") %>%
      mutate(CAGE_support = CAGE_support == "yes") %>%
      dplyr::rename(within_ATAC_Nowakowski = "CAGE_support"),
    by = c("transcript_id" = "transcript_ID")
  ) %>%
  left_join(
    read_csv("data/within_ATAC/cpvz_LuisCP_500_ATAC_results.csv") %>%
      mutate(CAGE_support = CAGE_support == "yes") %>%
      dplyr::rename(within_ATAC_LuisCP = "CAGE_support"),
    by = c("transcript_id" = "transcript_ID")
  ) %>%
  left_join(
    read_csv("data/within_ATAC/cpvz_LuisVZ_500_ATAC_results.csv") %>%
      mutate(CAGE_support = CAGE_support == "yes") %>%
      dplyr::rename(within_ATAC_LuisGZ = "CAGE_support"),
    by = c("transcript_id" = "transcript_ID")
  ) %>%
  left_join(
    sqanti %>% select(isoform, within_polyA_site, polyA_motif_found),
    by = c("transcript_id" = "isoform")
  ) %>%
  mutate(across(!c("transcript_id", "transcript_novelty"), ~replace_na(., F)))

Rows: 214516 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): transcript_ID, CAGE_support

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 214516 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): transcript_ID, CAGE_support

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 214516 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): transcript_ID, CAGE_support

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 214516 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): transcript_ID, CAGE_support

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 214516 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): transcript_ID, CAGE_support

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 214516 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): transcript_ID, CAGE_support

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

support_for_plot

# A tibble: 214,516 × 10
   transcript_id trans…¹ withi…² withi…³ withi…⁴ withi…⁵ withi…⁶ withi…⁷ withi…⁸
   <chr>         <fct>   <lgl>   <lgl>   <lgl>   <lgl>   <lgl>   <lgl>   <lgl>  
 1 ENST00000494… Known   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE  
 2 ENST00000424… Known   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE  
 3 ENST00000445… Known   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE  
 4 ENST00000457… Known   TRUE    TRUE    TRUE    TRUE    FALSE   FALSE   TRUE   
 5 ENST00000414… Known   TRUE    FALSE   FALSE   FALSE   FALSE   FALSE   TRUE   
 6 ENST00000655… Known   FALSE   TRUE    TRUE    TRUE    FALSE   FALSE   FALSE  
 7 ENST00000457… Known   FALSE   TRUE    TRUE    TRUE    FALSE   FALSE   FALSE  
 8 ENST00000591… Known   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   TRUE   
 9 ENST00000644… Known   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE  
10 ENST00000445… Known   TRUE    TRUE    TRUE    TRUE    FALSE   FALSE   TRUE   
# … with 214,506 more rows, 1 more variable: polyA_motif_found <lgl>, and
#   abbreviated variable names ¹transcript_novelty, ²within_CAGE_refTSS,
#   ³within_CAGE_fetal, ⁴within_ATAC_Greenleaf, ⁵within_ATAC_Nowakowski,
#   ⁶within_ATAC_LuisCP, ⁷within_ATAC_LuisGZ, ⁸within_polyA_site

support_for_plot2 = support_for_plot %>%
  group_by(transcript_novelty) %>%
  summarize(
#    any_5p = sum(within_CAGE_refTSS | within_CAGE_fetal | within_ATAC_Greenleaf | within_ATAC_Nowakowski, na.rm = T),
    within_CAGE_peak = sum(within_CAGE_refTSS | within_CAGE_fetal, na.rm = T),
    within_ATAC_peak = sum(within_ATAC_Greenleaf | within_ATAC_Nowakowski | within_ATAC_LuisCP | within_ATAC_LuisGZ, na.rm = T),
#    any_3p = sum(within_polyA_site | polyA_motif_found, na.rm = T),
    within_polyA_site = sum(within_polyA_site, na.rm = T),
    polyA_motif_found = sum(polyA_motif_found, na.rm = T),
    n = n()
  ) %>%
  pivot_longer(!c("transcript_novelty", "n")) %>%
  mutate(prop = value / n) %>%
  mutate(end = if_else(name %in% c("within_polyA_site", "polyA_motif_found", "any_3p"), "3′-end support", "5′-end support")) %>%
  mutate(end = end %>% factor(levels = c("5′-end support", "3′-end support"))) %>%
  mutate(name = name %>% as_factor()) %>%
  mutate(transcript_novelty = transcript_novelty %>% fct_relevel("Known") %>% fct_other(drop = c("Antisense", "Intergenic", "Genomic")))
support_for_plot2

# A tibble: 28 × 6
   transcript_novelty     n name              value  prop end           
   <fct>              <int> <fct>             <int> <dbl> <fct>         
 1 ISM                83089 within_CAGE_peak  41387 0.498 5′-end support
 2 ISM                83089 within_ATAC_peak  45668 0.550 5′-end support
 3 ISM                83089 within_polyA_site 63833 0.768 3′-end support
 4 ISM                83089 polyA_motif_found 65760 0.791 3′-end support
 5 Known              65006 within_CAGE_peak  49495 0.761 5′-end support
 6 Known              65006 within_ATAC_peak  54138 0.833 5′-end support
 7 Known              65006 within_polyA_site 42187 0.649 3′-end support
 8 Known              65006 polyA_motif_found 41856 0.644 3′-end support
 9 NIC                50621 within_CAGE_peak  49442 0.977 5′-end support
10 NIC                50621 within_ATAC_peak  49674 0.981 5′-end support
# … with 18 more rows

ggplot(support_for_plot2, aes(x = prop, y = transcript_novelty, fill = fct_rev(name))) +
  geom_col(position = "dodge") +
  scale_y_discrete(limits = rev) +
  scale_fill_manual(
    limits = c("within_CAGE_peak", "within_ATAC_peak", "within_polyA_site", "polyA_motif_found"),
    labels = c("CAGE", "ATAC-seq", "polyA site", "polyA motif"),
    values = c("#00bfc4", "#00a9ff", "#c77cff", "#ff61cc")
  ) +
  labs(
    x = "Proportion of transcripts",
    y = NULL,
    fill = NULL
  ) +
  guides(
    fill = guide_legend(ncol = 2)
  ) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = "bottom",
    panel.spacing = unit(2, "lines"),
    plot.margin = margin(11, 11, 11, 11, "points")
  ) +
  facet_wrap(vars(end))

#ggsave("output/figures/Fig1_external_support_v8.png", width = 11, height = 8.5, units = "in", dpi = 300)
ggsave("output/figures/Fig2/Fig2B_external_support_v2.pdf", width = 8, height = 6, units = "in", dpi = 300, device=cairo_pdf)

tableS1 = talon_gtf %>%
  select(seqnames, start, end, strand, gene_id, gene_name, gene_status, gene_type, transcript_id, transcript_status, transcript_type) %>%
  left_join(
    cts %>% select(transcript_id, gene_novelty, transcript_novelty, ISM_subtype, n_exons, length),
    by = "transcript_id"
  ) %>%
  left_join(
    support_for_plot,
    by = c("transcript_id", "transcript_novelty")
  ) %>%
#  dplyr::rename(gencode_gene_type = "gene_type", gencode_transcript_type = "transcript_type") %>%
  relocate(gene_novelty, .after = gene_type) %>%
  mutate(
    within_CAGE_peak = within_CAGE_refTSS | within_CAGE_fetal,
    within_ATAC_peak = within_ATAC_Greenleaf | within_ATAC_Nowakowski | within_ATAC_LuisCP | within_ATAC_LuisGZ,
    .keep = "unused",
    .before = within_polyA_site
  )
tableS1

# A tibble: 214,516 × 20
   seqnames start   end strand gene_id   gene_…¹ gene_…² gene_…³ gene_…⁴ trans…⁵
   <fct>    <int> <int> <fct>  <chr>     <chr>   <chr>   <chr>   <fct>   <chr>  
 1 chr1     14404 29570 -      ENSG0000… WASH7P  KNOWN   unproc… Known   TALONT…
 2 chr1     14404 21859 -      ENSG0000… WASH7P  KNOWN   unproc… Known   TALONT…
 3 chr1     14404 29570 -      ENSG0000… WASH7P  KNOWN   unproc… Known   TALONT…
 4 chr1     14404 21119 -      ENSG0000… WASH7P  KNOWN   unproc… Known   TALONT…
 5 chr1     14404 29570 -      ENSG0000… WASH7P  KNOWN   unproc… Known   TALONT…
 6 chr1     14404 29570 -      ENSG0000… WASH7P  KNOWN   unproc… Known   TALONT…
 7 chr1     14404 29570 -      ENSG0000… WASH7P  KNOWN   unproc… Known   TALONT…
 8 chr1     14404 29570 -      ENSG0000… WASH7P  KNOWN   unproc… Known   TALONT…
 9 chr1     14404 29570 -      ENSG0000… WASH7P  KNOWN   unproc… Known   TALONT…
10 chr1     14404 29570 -      ENSG0000… WASH7P  KNOWN   unproc… Known   TALONT…
# … with 214,506 more rows, 10 more variables: transcript_status <chr>,
#   transcript_type <chr>, transcript_novelty <fct>, ISM_subtype <fct>,
#   n_exons <dbl>, length <dbl>, within_CAGE_peak <lgl>,
#   within_ATAC_peak <lgl>, within_polyA_site <lgl>, polyA_motif_found <lgl>,
#   and abbreviated variable names ¹gene_name, ²gene_status, ³gene_type,
#   ⁴gene_novelty, ⁵transcript_id

tableS1 %>%
  write_tsv("output/tables/TableS1_v5.tsv.gz")

Numbers for manuscript text:

tableS1 %>% count(transcript_status)

# A tibble: 2 × 2
  transcript_status      n
  <chr>              <int>
1 KNOWN              65006
2 NOVEL             149510

tableS1Novel1 = tableS1 %>%
  filter(transcript_novelty %in% c("NIC", "NNC"))

tableS1Novel1 %>%
  summarize(n = n(), n_genes = n_distinct(gene_id))

# A tibble: 1 × 2
      n n_genes
  <int>   <int>
1 65184   10175

tableS1Novel2 = tableS1 %>%
  filter(transcript_novelty %in% c("NIC", "NNC", "ISM"))

tableS1Novel2 %>%
  count(within_CAGE_peak, within_ATAC_peak) %>%
  mutate(prop = n/nrow(tableS1Novel2))

# A tibble: 4 × 4
  within_CAGE_peak within_ATAC_peak     n   prop
  <lgl>            <lgl>            <int>  <dbl>
1 FALSE            FALSE            28580 0.193 
2 FALSE            TRUE             15183 0.102 
3 TRUE             FALSE            10204 0.0688
4 TRUE             TRUE             94306 0.636

tableS1Novel2 %>% count(within_CAGE_peak) %>% mutate(prop = n/nrow(tableS1Novel2))

# A tibble: 2 × 3
  within_CAGE_peak      n  prop
  <lgl>             <int> <dbl>
1 FALSE             43763 0.295
2 TRUE             104510 0.705

tableS1Novel2 %>% count(within_ATAC_peak) %>% mutate(prop = n/nrow(tableS1Novel2))

# A tibble: 2 × 3
  within_ATAC_peak      n  prop
  <lgl>             <int> <dbl>
1 FALSE             38784 0.262
2 TRUE             109489 0.738

tableS1Novel2 %>%
  count(within_polyA_site, polyA_motif_found) %>%
  mutate(prop = n/nrow(tableS1Novel2))

# A tibble: 4 × 4
  within_polyA_site polyA_motif_found      n   prop
  <lgl>             <lgl>              <int>  <dbl>
1 FALSE             FALSE              13342 0.0900
2 FALSE             TRUE               13429 0.0906
3 TRUE              FALSE              12279 0.0828
4 TRUE              TRUE              109223 0.737

tableS1 %>%
  filter(transcript_novelty %in% c("Known", "ISM")) %>%
  count(transcript_novelty, within_polyA_site | polyA_motif_found)

# A tibble: 4 × 3
  transcript_novelty `within_polyA_site | polyA_motif_found`     n
  <fct>              <lgl>                                   <int>
1 ISM                FALSE                                    9997
2 ISM                TRUE                                    73092
3 Known              FALSE                                   15470
4 Known              TRUE                                    49536

Splice junctions from SQANTI:

sqantiJunc = read_tsv("data/sqanti/cp_vz_0.75_min_7_recovery_talon_junctions.txt.gz")

Rows: 1901871 Columns: 24
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (10): isoform, chrom, strand, junction_number, transcript_coord, junctio...
dbl  (9): genomic_start_coord, genomic_end_coord, diff_to_Ref_start_site, di...
lgl  (5): bite_junction, RTS_junction, indel_near_junct, phyloP_start, phylo...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

uniqueJunc = sqantiJunc %>%
  group_by(chrom, strand, genomic_start_coord, genomic_end_coord) %>%
  summarize(
    n = sum(intropolis.v1.hg19.tsv.min_count_10_unique),
    intropolis_support = n > 0,
    novel = all(junction_category == "novel"),
    canonical = all(canonical == "canonical"),
    .groups = "drop"
  )

uniqueJunc %>% count(intropolis_support) %>% mutate(prop = n/nrow(uniqueJunc))

# A tibble: 2 × 3
  intropolis_support      n   prop
  <lgl>               <int>  <dbl>
1 FALSE               10896 0.0471
2 TRUE               220360 0.953

uniqueJunc %>% count(novel) %>% mutate(prop = n/nrow(uniqueJunc))

# A tibble: 2 × 3
  novel      n  prop
  <lgl>  <int> <dbl>
1 FALSE 193141 0.835
2 TRUE   38115 0.165

uniqueJunc %>% filter(novel) %>% count(intropolis_support) %>% mutate(prop = n/nrow(uniqueJunc %>% filter(novel)))

# A tibble: 2 × 3
  intropolis_support     n  prop
  <lgl>              <int> <dbl>
1 FALSE               9862 0.259
2 TRUE               28253 0.741

Problematic spliceJns.txt gives us 57.5% novel SJs:

gencodeJuncOld = read_tsv("data/splice_junctions/spliceJns.problematic.txt", col_names = c("chrom", "genomic_start_coord", "genomic_end_coord", "strand")) %>%
  select(!starts_with("X")) %>%
  mutate(strand = case_when(strand == 1 ~ "+", strand == 2 ~ "-")) %>%
  mutate(novel2 = FALSE)

Rows: 474236 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): chrom
dbl (5): genomic_start_coord, genomic_end_coord, strand, X5, X6
lgl (3): X7, X8, X9

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

uniqueJunc2 = uniqueJunc %>% left_join(gencodeJuncOld) %>% mutate(novel2 = replace_na(novel2, TRUE))

Joining, by = c("chrom", "strand", "genomic_start_coord", "genomic_end_coord")

uniqueJunc2 %>% count(novel2) %>% mutate(prop = n/nrow(uniqueJunc2))

# A tibble: 2 × 3
  novel2      n  prop
  <lgl>   <int> <dbl>
1 FALSE   98334 0.425
2 TRUE   132922 0.575

Updated spliceJns.txt agrees with SQANTI:

gencodeJunc = read_tsv("data/splice_junctions/spliceJns.exon_reorder.txt", col_names = c("chrom", "genomic_start_coord", "genomic_end_coord", "strand")) %>%
  select(!starts_with("X")) %>%
  mutate(strand = case_when(strand == 1 ~ "+", strand == 2 ~ "-")) %>%
  mutate(novel2 = FALSE)

Rows: 383616 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): chrom
dbl (5): genomic_start_coord, genomic_end_coord, strand, X5, X6
lgl (3): X7, X8, X9

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

uniqueJunc2 = uniqueJunc %>% left_join(gencodeJunc) %>% mutate(novel2 = replace_na(novel2, TRUE))

Joining, by = c("chrom", "strand", "genomic_start_coord", "genomic_end_coord")

uniqueJunc2 %>% count(novel2) %>% mutate(prop = n/nrow(uniqueJunc2))

# A tibble: 2 × 3
  novel2      n  prop
  <lgl>   <int> <dbl>
1 FALSE  192904 0.834
2 TRUE    38352 0.166

Joining with intropolis junctions also agrees with SQANTI:

intropJunc = read_tsv("ref/intropolis/intropolis_v1_hg19_2samples_10counts_starSJout.tsv.gz", col_names = c("chrom", "genomic_start_coord", "genomic_end_coord", "strand")) %>%
  select(!starts_with("X")) %>%
  mutate(strand = case_when(strand == 1 ~ "+", strand == 2 ~ "-")) %>%
  mutate(intropolis_support2 = TRUE)

Rows: 4974342 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): chrom
dbl (8): genomic_start_coord, genomic_end_coord, strand, X5, X6, X7, X8, X9
lgl (1): X10

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

uniqueJunc2 = uniqueJunc %>%
  left_join(intropJunc) %>%
  mutate(intropolis_support2 = replace_na(intropolis_support2, FALSE))

Joining, by = c("chrom", "strand", "genomic_start_coord", "genomic_end_coord")

uniqueJunc2 %>% count(intropolis_support2) %>% mutate(prop = n/nrow(uniqueJunc2))

# A tibble: 2 × 3
  intropolis_support2      n   prop
  <lgl>                <int>  <dbl>
1 FALSE                10897 0.0471
2 TRUE                220359 0.953