options(stringsAsFactors = F)
options(ucscChromosomeNames = F)
suppressMessages({
library(data.table)
library(tidyverse)
library(rtracklayer)
library(GenomicFeatures)
library(GenomicRanges)
library(plyranges)
# devtools::install_github("mskilab/gUtils")
library(gUtils)
})
= c(
colorVector "Known" = "#009E73",
"ISM" = "#0072B2",
"ISM_Prefix" = "#005996",
"ISM_Suffix" = "#378bcc",
"NIC" = "#D55E00",
"NNC" = "#E69F00",
"Other" = "#000000"
)= colorVector[-2]
colorVector_ismSplit
# Gencode v33 Annotations
="ref/gencode.v33lift37.annotation.gtf.gz"
gencode= rtracklayer::import(gencode)
gr.gencode #txdb.gencode = makeTxDbFromGRanges(gr.gencode)
# Isoseq Annotations
="data/cp_vz_0.75_min_7_recovery_talon.gtf.gz"
isoseq= rtracklayer::import(isoseq)
gr.isoseq #txdb.isoseq = makeTxDbFromGRanges(gr.isoseq)
Figure 2 - Novel Exons
Ashok’s code using bash and bedtools
gzcat ref/gencode.v33lift37.annotation.gtf.gz | awk '{if($3=="exon") { for(i=1;i<=NF;i++)if($i~/(exon_id)/){ print $1"\t"$4"\t"$5"\t"$7"\t"$16"\t"$i"\t"$(i+1)}}}' | sed 's/[";]//g' | awk '{print $1"\t"$2"\t"$3"\t"$5"_"$6"_"$7"\t""1000""\t"$4}' | awk '!seen[$0]++' > data/working/novel_exons/All_Gencode_Exon.bed
gzcat data/cp_vz_0.75_min_7_recovery_talon.gtf.gz | awk '{if($3=="exon") { for(i=1;i<=NF;i++)if($i~/(exon_status|talon_exon|gene_name)/){ printf "%s%s",(!c++? "":FS),$i"\t"$(i+ 1) }; print "\t"$1"\t"$4"\t"$5"\t"$7; c=0 }
}' | sed 's/[";]//g' | awk '{if($6=="NOVEL") print $7"\t"$8"\t"$9"\t"$2"_"$3"_"$4"\t""1000""\t"$NF}' | awk '!seen[$0]++' > data/working/novel_exons/All_Novel_Exon.bed
bedtools intersect -a data/working/novel_exons/All_Novel_Exon.bed -b data/working/novel_exons/All_Gencode_Exon.bed -v > data/working/novel_exons/No_genecode_ovelap_novel_exon.bed
sort -k 1,1 -k2,2n data/working/novel_exons/All_Novel_Exon.bed > data/working/novel_exons/All_Novel_Exon_sorted.bed
sort -k 1,1 -k2,2n data/working/novel_exons/All_Gencode_Exon.bed > data/working/novel_exons/All_Gencode_Exon_sorted.bed
bedtools multiinter -header -names Novel Gencode -i data/working/novel_exons/All_Novel_Exon_sorted.bed data/working/novel_exons/All_Gencode_Exon_sorted.bed > data/working/novel_exons/bulk_NewCoding.txt
***** WARNING: File data/working/novel_exons/All_Novel_Exon.bed has inconsistent naming convention for record:
GL000204.1 56744 57007 TALONG000062493_talon_exon_1018404 1000 -
***** WARNING: File data/working/novel_exons/All_Novel_Exon.bed has inconsistent naming convention for record:
GL000204.1 56744 57007 TALONG000062493_talon_exon_1018404 1000 -
#Mike’s code using R
#How many distinct GENCODE exons?
= gr.gencode %>% as_tibble() %>% filter(type=='exon') %>%
exons.gencode_unique ::select(seqnames, start, end, exon_id)
dplyr$seqnames=as.character(exons.gencode_unique$seqnames)
exons.gencode_uniquelength(unique(exons.gencode_unique$exon_id)) #748355 unique exon IDs in gencode
[1] 748355
$coord = paste0(exons.gencode_unique$seqnames, ":", exons.gencode_unique$start, "-", exons.gencode_unique$end)
exons.gencode_uniquelength(unique(exons.gencode_unique$coord)) #634758 unique chr-start-stop in gencode
[1] 634758
= unique(gr.gencode[gr.gencode$type=='exon',])
exons.gencode length(exons.gencode)
[1] 634771
= read.table("data/working/novel_exons/All_Gencode_Exon_sorted.bed")
ashok.gencodedim(ashok.gencode) # 750284 in Ashok's bed file
[1] 750284 6
# How many distinct IsoSeq exons? ~83k
= gr.isoseq %>% as_tibble() %>% filter(type=='exon',exon_status=="NOVEL") %>% dplyr::select(seqnames, start, end, exon_id) %>% mutate(coord=paste0(as.character(seqnames), ":", start, "-", end))
exons.isoseq_unique length(unique(exons.isoseq_unique$exon_id)) #334861 by exon_ID in Talon
[1] 82882
length(unique(exons.isoseq_unique$coord)) #334774 by coordinates
[1] 82850
= read.table("data/working/novel_exons/All_Novel_Exon.bed")
ashok.isoseq dim(ashok.isoseq) #83153 in Ashok's bed file
[1] 83153 6
= unique(gr.isoseq[gr.isoseq$type=="exon",])
exons.isoseq
# Find compltely non-overlapping exons
## Used two methods, both give 7039 unique exons in TALON not overlapping in Gencode -- from 3549 genes
<- exons.isoseq %>% filter_by_non_overlaps(exons.gencode, minoverlap = 2L) %>% filter(grepl("chr", seqnames))
novel
length(novel) #7039 novel exons
[1] 7039
length(unique(novel$gene_id)) #3551 genes with novel exons
[1] 3551
sum(width(novel %>% reduce_ranges())) #3849462 bp --> 3.85MB of novel exons
[1] 3849462
= subsetByOverlaps(exons.isoseq, exons.gencode,invert = T,type='any',ignore.strand=T, minoverlap = 2L)
novel2 = novel2[grepl('chr',seqnames(novel2))]
novel2 length(unique(novel2$gene_name))
[1] 3549
= novel2 %>% as_tibble()
these_novel
write.csv(these_novel, file="data/working/novel_exons/mike_novel.csv")
## Write GTF for novel exons, and control (strand flipped exons)
export(novel2,"data/working/novel_exons/novel_exons_mike.gff",format = "gff")
export(gr.flipstrand(novel2),"data/working/novel_exons/novel_exons_strandflip_mike.gff",format = "gff")
= read.table("data/working/novel_exons/No_genecode_ovelap_novel_exon.bed")
ashok.novel = ashok.novel[grepl("chr",ashok.novel$V1),]
ashok.novel dim(ashok.novel) # Ashok gets 7041 isoforms in 3542 genes
[1] 7041 6
= unique(unlist(lapply(strsplit(ashok.novel$V4, "_"),'[',1)))
ashok.novel.genes
length(intersect(ashok.novel.genes, novel2$gene_name))
[1] 3539
!ashok.novel.genes %in% novel2$gene_name] ashok.novel.genes[
[1] "SLC35A1" "PPP1R32" "AC007262.2"
$gene_name[!novel2$gene_name %in% ashok.novel.genes] novel2
[1] "PINK1" "ERI3" "RBM5" "B3GALNT1"
[5] "PDK4" "SMARCD3" "RBM17" "ABLIM1"
[9] "DDB1" "TALONG000089386"
%>% head() these_novel
# A tibble: 6 × 54
seqnames start end width strand source type score phase gene_id gene_…¹
<fct> <int> <int> <int> <fct> <fct> <fct> <dbl> <int> <chr> <chr>
1 chr1 18913 19139 227 - TALON exon NA NA ENSG0000… WASH7P
2 chr1 18913 20286 1374 - TALON exon NA NA ENSG0000… WASH7P
3 chr1 18913 21119 2207 - TALON exon NA NA ENSG0000… WASH7P
4 chr1 18913 20960 2048 - TALON exon NA NA ENSG0000… WASH7P
5 chr1 18913 19416 504 - TALON exon NA NA ENSG0000… WASH7P
6 chr1 840675 841059 385 + TALON exon NA NA ENSG0000… AL6456…
# … with 43 more variables: gene_status <chr>, gene_type <chr>,
# talon_gene <chr>, havana_gene <chr>, hgnc_id <chr>, level <chr>,
# remap_num_mappings <chr>, remap_status <chr>, remap_target_status <chr>,
# transcript_id <chr>, transcript_status <chr>, transcript_name <chr>,
# talon_transcript <chr>, NNC_transcript <chr>, exon_number <chr>,
# exon_id <chr>, talon_exon <chr>, exon_status <chr>, ont <chr>,
# remap_original_location <chr>, source.1 <chr>, tag <chr>, …
$protein_coding = these_novel$gene_type=="protein_coding"
these_novel$protein_coding[is.na(these_novel$protein_coding)] = F
these_novel$protein_coding = factor(these_novel$protein_coding, levels=c(TRUE,FALSE))
these_novel= ggplot(these_novel, aes(x=width,fill=protein_coding)) + geom_histogram(alpha=.5) + scale_x_log10(limits=c(10,10000)) + theme_bw() +
Fig2_exonWidth labs(x="Novel exon width (bp)")
Fig2_exonWidth
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 62 rows containing non-finite values (stat_bin).
Warning: Removed 4 rows containing missing values (geom_bar).
ggsave(Fig2_exonWidth, file="output/figures/Fig2/Fig2_exonWidth.pdf",width=4,height=3)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 62 rows containing non-finite values (stat_bin).
Removed 4 rows containing missing values (geom_bar).