[74]:
library(ComplexHeatmap)
library(stringr)
library(ggplot2)
library(RColorBrewer)
library(cowplot)
library(pheatmap)
library(dplyr)
library(xlsx)
library(readxl)

set.seed(2000)

cols<-brewer.pal(11,"PuOr")
pal<-colorRampPalette(cols)
mycolors1<-pal(42)

cols2<-brewer.pal(9,"YlGnBu")
pal2<-colorRampPalette(cols2)
mycolors2<-pal2(42)

ALL data PCA#

1.2 RNA FPKM PCA#

[75]:
fpkm <- read_excel("fpkm.annot.xlsx")
[76]:
select_samples <- function(tomato_FPKM_data){
    fpkm2 <- data.frame(tomato_FPKM_data)
    rownames(fpkm2) <- fpkm2$ID
    fpkm2 <- fpkm2[2:175]
    return(fpkm2)
}
fpkm <- select_samples(fpkm)

[77]:
head(fpkm,2)
A data.frame: 2 × 174
CD0h.1CD0h.2CD0h.3C0h.1C0h.2C0h.3D0h.1D0h.2D0h.3CC12h.1DD216h.3CD216h.1CD216h.2CD216h.3C216h.1C216h.2C216h.3D216h.1D216h.2D216h.3
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
Solyc00g011560.10.090.1300000.440.190.2600.350.420.50.090000.240.670.04
Solyc00g011660.10.000.0000000.000.000.0000.000.000.00.050000.000.000.00
[78]:
get_metadata <- function(samples){
    Var1 <- data.frame(str_split_fixed(samples, "h.", 2))

    Var1$Time <- as.numeric(gsub('[^0-9.]','', Var1$X1))
    Var1$Condition <- gsub("^([[:alpha:]]*).*", "\\1", Var1$X1)
    rownames(Var1) <- samples
    Var1 <- Var1[,c("X2", "Time", "Condition")]
    colnames(Var1) <- c("rep", "Time", "Condition")

    return(Var1)
}

fpkm_metadata <- get_metadata(colnames(fpkm))
head(fpkm_metadata, 4)
A data.frame: 4 × 3
repTimeCondition
<chr><dbl><chr>
CD0h.110CD
CD0h.220CD
CD0h.330CD
C0h.110C
[138]:
PCA_fig <- function(pcaData, Pca.Var.Per, pc1, pc2, color="Time", shape="Condition", title="FPKM PCA"){
    p_PCA <- ggplot(data=pcaData,aes_string(x=paste0("PC",pc1),y=paste0("PC",pc2),color=color,shape=shape))+
      geom_point(size=2) +
      theme_bw()+theme(panel.grid=element_blank()) +
      xlab(paste("PC",pc1,"(",Pca.Var.Per[pc1],"%","variance)",sep="")) +
      ylab(paste("PC",pc2,"(",Pca.Var.Per[pc2],"%","variance)",sep="")) +
      scale_color_gradient2() +
      theme(legend.key.size=unit(4,'mm'), text = element_text(size=12)) +
      coord_equal() +
      labs(title=title)

    return(p_PCA)
}


[ ]:

[141]:
doRNApca <- function(df, meta_data, vars_N=3000){
    data <- as.data.frame(lapply(df, as.numeric))
    rownames(data) <- rownames(df)
    data[is.na(data)] <- 0
    data <- data[which(rowSums(data) > 0),]

    cg=names(tail(sort(apply(data, 1 , sd)), vars_N))
    #apply按行('1'是按行取,'2'是按列取)取每一行的方差,从小到大排序,取最大的1000个
    data <- data[cg,]

    ## PCA
    pca <- prcomp(t(data), scale=TRUE)
    pca.var <- pca$sdev^2
    pca.var.per <- round(pca.var/sum(pca.var)*100, 1)
    PCAmat <- as.data.frame(pca$x[,1:3])

    PCAmat <- cbind(PCAmat, meta_data)
    PCAmat <- cbind(PCAmat, t(data))
    #PCAmat$HAG <- factor(PCAmat$HAG, levels=sort(unique(PCAmat$HAG)))
    #PCAmat$HAG <- as.numeric(PCAmat$HAG)
    #PCAmat$HAG <- as.numeric(as.character(PCAmat$HAG))
    return(list(PCAmat=PCAmat, pca.var.per = pca.var.per))
}

fpkm_PCA <- doRNApca(fpkm, fpkm_metadata, vars_N=3000)

[142]:
head(fpkm_PCA$PCAmat,2)
A data.frame: 2 × 3006
PC1PC2PC3repTimeConditionSolyc01g104030.3Solyc09g007540.3Solyc10g050440.2Solyc01g074000.3Solyc03g119210.1Solyc01g109660.2Solyc08g074480.1Solyc12g044330.2Solyc05g008895.1Solyc08g078870.3Solyc03g034370.1Solyc12g094620.3Solyc09g010800.5.1Solyc09g075210.3
<dbl><dbl><dbl><chr><dbl><chr><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
CD0h.1-1.820171-13.2530832.1394710CD29.4976.7425.2119.93288.892229.743012.131984.41285.081244.82747.196857.774288.54758.06
CD0h.2 1.672188-10.7943530.6439820CD37.0572.7247.3720.98307.102121.112983.492294.18324.881349.20750.097503.054321.57680.50
[145]:
p_pca1 <- PCA_fig(fpkm_PCA$PCAmat, fpkm_PCA$pca.var.per, 1, 2, color="Time", shape="Condition", title="FPKM PCA")
p_pca2 <- PCA_fig(fpkm_PCA$PCAmat, fpkm_PCA$pca.var.per, 1, 3, color="Time", shape="Condition", title="FPKM PCA")

options(repr.plot.width=10, repr.plot.height=4)
plot_grid(p_pca1, p_pca2, ncol=2)


ggsave(p_pca1, file='fig/RNA_PCA12.pdf', width=5, height=5)
ggsave(p_pca2, file='fig/RNA_PCA13.pdf', width=5, height=5)
../../_images/notebooks_dataQC_Fig1_pca_10_0.png

1.2 sugar#

[107]:
sugar <- read_excel("sugar.levels.xlsx")
rownames(sugar) <- sugar$Index
sugar <- data.frame(sugar)
sugar_data <- sugar[c(1:3,5:13),13:186]
sugar_SugarMetadata <- sugar[c(1:3,5:13),c(2,4)]


sugarSampleMetadata <- function(samples){
    Var1 <- data.frame(str_split_fixed(samples, "_", 2))$X2
    Var1 <- data.frame(str_split_fixed(Var1, "h.", 2))

    Var1$Time <- as.numeric(gsub('[^0-9.]','', Var1$X1))
    Var1$Condition <- gsub("^([[:alpha:]]*).*", "\\1", Var1$X1)
    rownames(Var1) <- samples
    Var1 <- Var1[,c("X2", "Time", "Condition")]
    colnames(Var1) <- c("rep", "Time", "Condition")

    return(Var1)
}

SugarMetaData <- suppressWarnings(sugarSampleMetadata(colnames(sugar_data)))
head(SugarMetaData, 2)

Warning message:
“Setting row names on a tibble is deprecated.”
A data.frame: 2 × 3
repTimeCondition
<chr><dbl><chr>
S_C0h.110C
S_C0h.220C
[108]:
head(sugar_data,2)
head(sugar_SugarMetadata, 2)
A data.frame: 2 × 174
S_C0h.1S_C0h.2S_C0h.3S_C120h.1S_C120h.2S_C120h.3S_C12h.1S_C12h.2S_C12h.3S_C168h.1S_DD60h.3S_DD72h.1S_DD72h.2S_DD72h.3S_DD84h.1S_DD84h.2S_DD84h.3S_DD96h.1S_DD96h.2S_DD96h.3
<chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr>
Suc8.42546356 10.00109 7.49594127 16.939456317.6777468 18.4620683 18.9145304 19.9808041 17.2351266 26.9638331 18.775229416.588218812.674543717.097063122.9579815 21.7961478 17.8414553 22.6884751 19.8810398 20.3659981
Mal0.0227022310.02151876610.031376193N/A 0.006974325580.00759573940.01528362040.03656185570.01293570950.0370845956N/A N/A N/A N/A 0.007788012360.006872982820.005652133730.01096909910.01041396530.00879881566
A data.frame: 2 × 2
CompoundsClass
<chr><chr>
SucSucrosedisaccharide
MalMaltosedisaccharide
[146]:
doSugarPCA <- function(df, meta_data){
    data <- as.data.frame(lapply(df, as.numeric))
    rownames(data) <- rownames(df)

    data[is.na(data)] <- 0
    data <- data[which(rowSums(data) > 0),]

    ## PCA
    pca <- prcomp(t(data), scale=TRUE)
    pca.var <- pca$sdev^2
    pca.var.per <- round(pca.var/sum(pca.var)*100, 1)
    PCAmat <- as.data.frame(pca$x[,1:3])

    PCAmat <- cbind(PCAmat, meta_data)
    PCAmat <- cbind(PCAmat, t(data))
    return(list(PCAmat=PCAmat, pca.var.per = pca.var.per))
}

sugar_PCA <- suppressWarnings(doSugarPCA(sugar_data, SugarMetaData))
[147]:
p_pca1 <- PCA_fig(sugar_PCA$PCAmat, sugar_PCA$pca.var.per, 1, 2, color="Time", shape="Condition", title="sugar PCA")
p_pca2 <- PCA_fig(sugar_PCA$PCAmat, sugar_PCA$pca.var.per, 1, 3, color="Time", shape="Condition", title="sugar PCA")

options(repr.plot.width=10, repr.plot.height=4)
plot_grid(p_pca1, p_pca2, ncol=2)


ggsave(p_pca1, file='fig/sugar_PCA12.pdf', width=5, height=5)
ggsave(p_pca2, file='fig/sugar_PCA13.pdf', width=5, height=5)
../../_images/notebooks_dataQC_Fig1_pca_15_0.png
[ ]:

[ ]:

[ ]:

1.3 energy PCA#

[134]:
energy <- read_excel("energy.levels.xlsx")
energy <- data.frame(energy)
rownames(energy) <- energy$Index
energy_vars <- rownames(energy)[!(rownames(energy) %in% c("L-Cystine","Succinyl-CoA","Phenyllactate", "Oxaloacetate", "c-di-AMP", "dCMP", "dAMP", "dTMP", "dUMP", "Trehalose-6-phosphate"))]
energy <- energy[energy_vars,]
energy<- energy[,4:177]
head(energy,2)

energySampleMetadata <- function(samples){
    Var1 <- data.frame(str_split_fixed(samples, "_", 2))$X2
    Var1 <- data.frame(str_split_fixed(Var1, "h.", 2))

    Var1$Time <- as.numeric(gsub('[^0-9.]','', Var1$X1))
    Var1$Condition <- gsub("^([[:alpha:]]*).*", "\\1", Var1$X1)
    rownames(Var1) <- samples
    Var1 <- Var1[,c("X2", "Time", "Condition")]
    colnames(Var1) <- c("rep", "Time", "Condition")

    return(Var1)
}

energyMetaData <- suppressWarnings(energySampleMetadata(colnames(energy)))
head(energyMetaData, 2)

A data.frame: 2 × 174
E_CD0h.1E_CD0h.2E_CD0h.3E_C0h.1E_C0h.2E_C0h.3E_D0h.1E_D0h.2E_D0h.3E_CC12h.1E_DD216h.3E_CD216h.1E_CD216h.2E_CD216h.3E_C216h.1E_C216h.2E_C216h.3E_D216h.1E_D216h.2E_D216h.3
<chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr>
L-Arginine14230.228485458912167.778066159313523.653916365714141.283902893215088.824444359110620.788746936116801.569729915 11566.801220426913094.629368731315572.136276612120643.051559785423906.805849176322508.185792717419502.921012854851223.698348693752078.013553988345727.928732908516460.924842393311505.563610551813573.2767046503
Acetyl-CoA49.791073139482955.281393011040657.176639116605274.303624085664 63.039914852177368.351682552118465.747060902590348.529885793303349.701284426961369.360824718819958.324563246821479.039160747503390.981648022357967.000529771049985.251920150975479.357276124521976.997991173570861.486454514375960.592326683117559.9273524809759
A data.frame: 2 × 3
repTimeCondition
<chr><dbl><chr>
E_CD0h.110CD
E_CD0h.220CD
[148]:
doenergyPCA <- function(df, meta_data){
    data <- as.data.frame(lapply(df, as.numeric))
    rownames(data) <- rownames(df)

    data[is.na(data)] <- 0
    data <- data[which(rowSums(data) > 0),]

    ## PCA
    pca <- prcomp(t(data), scale=TRUE)
    pca.var <- pca$sdev^2
    pca.var.per <- round(pca.var/sum(pca.var)*100, 1)
    PCAmat <- as.data.frame(pca$x[,1:3])

    PCAmat <- cbind(PCAmat, meta_data)
    PCAmat <- cbind(PCAmat, t(data))
    return(list(PCAmat=PCAmat, pca.var.per = pca.var.per))
}

energy_PCA <- suppressWarnings(doSugarPCA(energy, energyMetaData))

[149]:
p_pca1 <- PCA_fig(energy_PCA$PCAmat, energy_PCA$pca.var.per, 1, 2, color="Time", shape="Condition", title="energy PCA")
p_pca2 <- PCA_fig(energy_PCA$PCAmat, energy_PCA$pca.var.per, 1, 3, color="Time", shape="Condition", title="energy PCA")

options(repr.plot.width=10, repr.plot.height=4)
plot_grid(p_pca1, p_pca2, ncol=2)

ggsave(p_pca1, file='fig/energy_PCA12.pdf', width=5, height=5)
ggsave(p_pca2, file='fig/energy_PCA13.pdf', width=5, height=5)

../../_images/notebooks_dataQC_Fig1_pca_22_0.png

1.4 energy + sugar PCA#

[ ]:

[ ]:

2 mfuzz#

[153]:
library(Mfuzz)
[185]:
energy_mfuzz <- function(data, n=10){
    dat <- new('ExpressionSet', exprs=as.matrix(data))
    dat <- filter.NA(dat, thres = 0.25)
    dat <- fill.NA(dat, mode="mean")
    dat <- filter.std(dat, min.std=0)
    dat <- standardise(dat)

    m <- mestimate(dat)
    print(m)
    set.seed(2022)
    cl <- mfuzz(dat, c=n, m=m)
    return(list(cl=cl, dat=dat))
}

options(repr.plot.width=6, repr.plot.height=6)
cl_energy <- energy_mfuzz(t(energy_PCA$PCAmat[,7:53]), n=5)

0 genes excluded.
0 genes excluded.
[1] 1.127407
../../_images/notebooks_dataQC_Fig1_pca_28_1.png
[210]:

# 查看每个cluster中的基因个数 cl_energy$cl$size # 提取某个cluster下的基因 cl_energy$cl$cluster[cl_energy$cl$cluster == 1] # 查看基因和cluster之间的membership #cl_energy$cl$membership options(repr.plot.width=15, repr.plot.height=10) mfuzz.plot(cl_energy$dat, cl_energy$cl, mfrow=c(2,3), new.window= FALSE, time.labels = energy_PCA$PCAmat$Time)
  1. 13
  2. 6
  3. 6
  4. 11
  5. 11
Acetyl-CoA
1
Adenine
1
Inosine
1
ADP
1
D-Glucose-6-phosphate
1
D-Glucose-1-phosphate
1
Glycerol-3-phosphate
1
6-Phosphogluconate-acid
1
D-Erythrose-4-phosphate
1
Sedoheptulose-7-phosphate
1
UMP
1
ATP
1
GDP
1
../../_images/notebooks_dataQC_Fig1_pca_29_2.png

2.2 sugar#

[198]:
sugar_mfuzz <- function(data, n=10){
    dat <- new('ExpressionSet', exprs=as.matrix(data))
    dat <- filter.NA(dat, thres = 0.25)
    dat <- fill.NA(dat, mode="mean")
    dat <- filter.std(dat, min.std=0)
    dat <- standardise(dat)

    m <- mestimate(dat)
    print(m)
    set.seed(2022)
    cl <- mfuzz(dat, c=n, m=m)
    return(list(cl=cl, dat=dat))
}

options(repr.plot.width=6, repr.plot.height=6)
cl_sugar <- sugar_mfuzz(t(sugar_PCA$PCAmat[,7:18]), n=6)

0 genes excluded.
0 genes excluded.
[1] 1.425216
../../_images/notebooks_dataQC_Fig1_pca_31_1.png
[199]:

# 查看每个cluster中的基因个数 cl_sugar$cl$size # 提取某个cluster下的基因 cl_sugar$cl$cluster[cl_sugar$cl$cluster == 1] # 查看基因和cluster之间的membership #cl_energy$cl$membership options(repr.plot.width=15, repr.plot.height=10) mfuzz.plot(cl_sugar$dat, cl_sugar$cl, mfrow=c(2,3), new.window= FALSE)
  1. 2
  2. 4
  3. 2
  4. 1
  5. 2
  6. 1
Suc
1
Mal
1
../../_images/notebooks_dataQC_Fig1_pca_32_2.png
[ ]:
RNA_mfuzz <- function(data, n=10){
    dat <- new('ExpressionSet', exprs=as.matrix(data))
    dat <- filter.NA(dat, thres = 0.25)
    dat <- fill.NA(dat, mode="mean")
    dat <- filter.std(dat, min.std=0)
    dat <- standardise(dat)

    m <- mestimate(dat)
    print(m)
    set.seed(2022)
    cl <- mfuzz(dat, c=n, m=m)
    return(list(cl=cl, dat=dat))
}

options(repr.plot.width=6, repr.plot.height=6)
cl_RNA <- RNA_mfuzz(t(sugar_PCA$PCAmat[,7:18]), n=6)

[205]:
head(fpkm_PCA$PCAmat,2)
A data.frame: 2 × 3006
PC1PC2PC3repTimeConditionSolyc01g104030.3Solyc09g007540.3Solyc10g050440.2Solyc01g074000.3Solyc03g119210.1Solyc01g109660.2Solyc08g074480.1Solyc12g044330.2Solyc05g008895.1Solyc08g078870.3Solyc03g034370.1Solyc12g094620.3Solyc09g010800.5.1Solyc09g075210.3
<dbl><dbl><dbl><chr><dbl><chr><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
CD0h.1-1.820171-13.2530832.1394710CD29.4976.7425.2119.93288.892229.743012.131984.41285.081244.82747.196857.774288.54758.06
CD0h.2 1.672188-10.7943530.6439820CD37.0572.7247.3720.98307.102121.112983.492294.18324.881349.20750.097503.054321.57680.50
[206]:
unique(fpkm_PCA$PCAmat$Time)
  1. 0
  2. 12
  3. 24
  4. 36
  5. 48
  6. 60
  7. 72
  8. 84
  9. 96
  10. 120
  11. 168
  12. 216
[ ]:

[ ]: