options(stringsAsFactors = F)
options(ucscChromosomeNames = F)
suppressMessages({
library(data.table)
library(tidyverse)
library(IsoformSwitchAnalyzeR)
library(rtracklayer)
library(ggrepel)
library(scales)
library(GenomicFeatures)
library(GenomicRanges)
library(GenomicInteractions)
library(Gviz)
})
= c(
colorVector "Known" = "#009E73",
"ISM" = "#0072B2",
"ISM_Prefix" = "#005996",
"ISM_Suffix" = "#378bcc",
"NIC" = "#D55E00",
"NNC" = "#E69F00",
"Other" = "#000000"
)= colorVector[-2] colorVector_ismSplit
Figure 2 - Plot One Gene
if(!file.exists('data/working/locusPlot_workingData.RData')) {
# CAGE tracks
=AnnotationTrack(range = "ref/CAGE/hg19.cage_peak_phase1and2combined_coord.bed",
cagebackground.panel = "#99d8c9",
fill="#fc9272",
name = "CAGE peaks",
col.line="#99d8c9",
background.title="#2ca25f",
fontcolor.title="black")
# Intropolis junctions
=read.delim("ref/intropolis/intropolis_v1_hg19_2samples_10counts_starSJout.tsv.gz",header = F)
jc= jc[jc$V7>100,]
jc= GRanges(jc$V1, IRanges(jc$V2 + 1, width=5))
anchor.one = GRanges(jc$V1, IRanges(jc$V3, width=5))
anchor.two = log2(jc$V7)
interaction_counts
=GenomicInteractions(anchor.one,anchor.two,interaction_counts)
jc_object=InteractionTrack(jc_object,name = "Intropolis junctions")
jc_trackdisplayPars(jc_track)=list(background.panel = "#fee0d2",
col.interactions ="#6a51a3", #"#43a2ca",
col.anchors.line = "gray",
col.anchors.line = "gray",
lwd=0.6,
fontcolor.title="black",
background.title="#de2d26",
plot.outside = FALSE)
# Gencode v33 Annotations
="ref/gencode.v33lift37.annotation.gtf.gz"
gencode=makeTxDbFromGFF(gencode, format="gtf")
gencode_txdb=exonsBy(gencode_txdb,by="tx",use.names=T)
gencode_transcript= rtracklayer::import(gencode) %>% as_tibble()
gr.gencode
# Isoseq Annotations
="data/sqanti/cp_vz_0.75_min_7_recovery_talon_corrected.gtf.cds.gff.gz"
isoseq=makeTxDbFromGFF(isoseq, format="gtf")
isoseq_txdb=exonsBy(isoseq_txdb,by="tx",use.names=T)
isoseq_transcript= rtracklayer::import(isoseq) %>% as_tibble()
gr.isoseq
= read_table("data/cp_vz_0.75_min_7_recovery_talon_abundance_filtered.tsv.gz")
cts $novelty2 = as.character(cts$transcript_novelty)
cts$novelty2[which(cts$novelty2=="ISM" & cts$ISM_subtype=="Prefix")] = "ISM_Prefix"
cts$novelty2[which(cts$novelty2=="ISM" & cts$ISM_subtype=="Suffix")] = "ISM_Suffix"
cts$novelty2[cts$novelty2 %in% c("Antisense", "Genomic", "Intergenic", "ISM")] = "Other"
cts$novelty2 = factor(cts$novelty2,levels=c("Known", "ISM_Prefix", "ISM_Suffix", "NIC", "NNC", "Other"))
cts$counts = rowSums(cts[,c(12:35)])
cts$cpm = cts$counts / (sum(cts$counts)/1000000)
cts
= gr.isoseq
gr.isoseq.old
<- gr.isoseq.old %>% left_join(cts, by=c("transcript_id" = "annot_transcript_id"))
gr.isoseq
save.image('data/working/locusPlot_workingData.RData')
else {
} load('data/working/locusPlot_workingData.RData')
}
Import genomic features from the file as a GRanges object ... OK
Prepare the 'metadata' data frame ... OK
Make the TxDb object ...
Warning in .get_cds_IDX(mcols0$type, mcols0$phase): The "phase" metadata column contains non-NA values for features of type
stop_codon. This information was ignored.
Warning in .reject_transcripts(bad_tx, because): The following transcripts were dropped because they have incompatible
CDS and stop codons: ENST00000422803.2_2, ENST00000618549.1_2,
ENST00000619291.4_2, ENST00000621077.1_2, ENST00000621229.1_2,
ENST00000631326.2_2
OK
Import genomic features from the file as a GRanges object ... OK
Prepare the 'metadata' data frame ... OK
Make the TxDb object ...
Warning in .get_cds_IDX(mcols0$type, mcols0$phase): some CDS phases are missing
or not between 0 and 2
OK
── Column specification ────────────────────────────────────────────────────────
cols(
.default = col_double(),
annot_gene_id = col_character(),
annot_transcript_id = col_character(),
annot_gene_name = col_character(),
annot_transcript_name = col_character(),
gene_novelty = col_character(),
transcript_novelty = col_character(),
ISM_subtype = col_character()
)
ℹ Use `spec()` for the full column specifications.
Plot One Gene
= rtracklayer::import("data/cp_vz_0.75_min_7_recovery_talon.gtf.gz") %>%
isoseq.gene.names as_tibble() %>%
::filter(type == "gene") %>%
dplyr::select(gene_id, gene_name)
dplyr
= gr.isoseq %>% left_join(isoseq.gene.names) gr.isoseq
Joining, by = "gene_id"
= as_tibble(gr.gencode) %>% filter(type=="gene")
annot.gene
="UBE3A"
current_gene= data.frame(V1 = current_gene, V2 = unique(na.omit(gr.gencode$transcript_id[which(gr.gencode$gene_name==current_gene)])), V3="Gencode")
genes_to_plot
= rbind(genes_to_plot, data.frame(V1 = current_gene, V2 = unique(na.omit(gr.isoseq$transcript_id[which(gr.isoseq$gene_name==current_gene)])), V3="Isoseq"))
genes_to_plot = genes_to_plot[!apply(is.na(genes_to_plot), 1,any),]
genes_to_plot
=base::setdiff(with(genes_to_plot,V2[V1 == current_gene & V3=="Gencode"]) , with(genes_to_plot,V2[V1 == current_gene & V3=="Isoseq"]))
gencode_only
=base::intersect(with(genes_to_plot,V2[V1 == current_gene & V3=="Gencode"]) , with(genes_to_plot,V2[V1 == current_gene & V3=="Isoseq"]))
shared
=base::setdiff(with(genes_to_plot,V2[V1 == current_gene & V3=="Isoseq"]) , with(genes_to_plot,V2[V1 == current_gene & V3=="Gencode"]))
novel=novel[grepl("^TALON",novel)]
novel
# gencode tracks
=gencode_transcript[gencode_only,]
gencode_only_transcript_onegene=unlist(gencode_only_transcript_onegene)
gencode_only_transcript_onegeneelementMetadata(gencode_only_transcript_onegene)$transcript <- names(gencode_only_transcript_onegene)
=GeneRegionTrack(gencode_only_transcript_onegene,group = "transcript",name = "Gencode undetected")
gencode_only_track
=gencode_transcript[shared,]
shared_transcript=unlist(shared_transcript)
shared_transcriptelementMetadata(shared_transcript)$transcript=names(shared_transcript)
=GeneRegionTrack(shared_transcript,group = "transcript",name = "Gencode detected")
shared_trackdisplayPars(gencode_only_track)=list(stacking="squish",
background.panel = "#ffffb2",
fill="#006d2c",
col="#006d2c",
lwd=0.3,
col.line="black",
fontcolor.title="black",
background.title="#bd0026")
displayPars(shared_track)=list(stacking="squish",
background.panel = "#f1eef6",
fill="#d62424",
col="#d62424",
lwd=0.3,
col.line="black",
fontcolor.title="black",
background.title="#045a8d")
# isoseq tracks
=isoseq_transcript[novel,]
isoseq_transcript_onegene=unlist(isoseq_transcript_onegene)
isoseq_transcript_onegeneelementMetadata(isoseq_transcript_onegene)$transcript=names(isoseq_transcript_onegene)
seqlevels(isoseq_transcript_onegene) = seqlevels(isoseq_transcript_onegene)[1:25]
=GeneRegionTrack(isoseq_transcript_onegene,group = "transcript",name = "Novel", u=FALSE)
isoseq_trackdisplayPars(isoseq_track)=list(stacking="squish",
background.panel = "#bcbddc",
fill="#171cc7",
col="#171cc7",
lwd=0.3,
col.line="black",
showId = FALSE,
transcriptAnnotation = "transcript",
fontcolor.title="black",
background.title="#7a0177")
# universal tracks
<- GenomeAxisTrack()
axisTrack <- IdeogramTrack(genome = "hg19", chromosome = unique(seqnames(gencode_only_track)))
ideoTrack
=min(c(gencode_only_transcript_onegene@ranges@start,shared_transcript@ranges@start,isoseq_transcript_onegene@ranges@start))
leftmost=max(c(gencode_only_transcript_onegene@ranges@start,shared_transcript@ranges@start,isoseq_transcript_onegene@ranges@start))
rightmost=(rightmost - leftmost)*0.05
extraplotTracks(list(ideoTrack,axisTrack,jc_track,cage,gencode_only_track,shared_track,isoseq_track),
chromosome = unique(seqnames(isoseq_track)),
from = leftmost - extra,
to = rightmost + extra,
sizes = c(1.5,1.5,1.5,1.5,5, max(1.5,0.35*length(shared)),max(2,0.3*length(novel))))
#devtools::install_github("dzhang32/ggtranscript")
library(ggtranscript)
="UBE3A"
this_gene
<- gr.isoseq %>% dplyr::filter(annot_gene_name == this_gene & type == "exon" & (counts > 10 | novelty2=="Known"))
these_exons <- gr.isoseq %>% dplyr::filter(annot_gene_name == this_gene & type == "CDS" & (counts > 10 | novelty2=="Known"))
this_cds
<-these_exons %>%
g1ggplot(aes(
xstart = start,
xend = end,
y = reorder(annot_transcript_name, counts)
+
)) geom_range(
aes(fill = log2(1+cpm), group=novelty2), height=.25) +
geom_range(data=this_cds, aes(fill = log2(1+cpm), group=novelty2)) +
geom_intron(
data = to_intron(these_exons, "annot_transcript_name"),
aes(strand = strand),arrow.min.intron.length = 500,
arrow = grid::arrow(ends = "last", length = grid::unit(0.1, "inches")),
color='grey60',
+ facet_grid(novelty2~.,scale='free',space='free') + theme_bw() + labs(y="") + ggtitle(this_gene,subtitle = unique(these_exons$seqnames)) + theme(plot.title = element_text(hjust=.5), plot.subtitle = element_text(hjust=.5))
) g1
ggsave(g1, file=paste0("output/figures/switch_plots/ggtranscript_", this_gene,".pdf"),height=10, width=12)
#
# these_exons <- gr.isoseq %>% dplyr::filter(annot_gene_name == this_gene & (type == "exon" | type=="CDS")& (counts > 100 | novelty2=="Known"))
#
# this_gene_rescaled <- shorten_gaps(
# these_exons,
# to_intron(these_exons %>% filter(type=="exon"), "annot_transcript_name"),
# group_var = "annot_transcript_name"
# )
#
# this_cds_rescaled <- shorten_gaps(
# this_cds,
# to_intron(this_cds, "annot_transcript_name"),
# group_var = "annot_transcript_name"
# )
#
# this_gene_rescaled %>%
# dplyr::filter(type == "exon") %>%
# ggplot(aes(
# xstart = start,
# xend = end,
# y = annot_transcript_name
# )) +
# geom_range(
# aes(fill = log(1+cpm)), height=.25) +
# geom_range(data=this_gene_rescaled %>% filter(type=="CDS"),aes(fill = log(1+cpm)))+
# geom_intron(
# data = this_gene_rescaled %>% dplyr::filter(type == "intron"),
# arrow.min.intron.length = 200, aes(strand=strand)
# )+ facet_grid(novelty2~.,scale='free',space='free') + theme_bw() + labs(y="")