## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", message = FALSE, warning = FALSE, fig.align = "center", crop = NULL ) ## ----eval = FALSE------------------------------------------------------------- # if (!requireNamespace("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # # BiocManager::install("CrcBiomeScreen") ## ----eval = FALSE------------------------------------------------------------- # install.packages(c("remotes")) # remotes::install_github("iChronostasis/CrcBiomeScreen", force = TRUE) ## ----eval = TRUE-------------------------------------------------------------- library(CrcBiomeScreen) ## ----eval = TRUE-------------------------------------------------------------- # Set working directory # setwd("/home/CRCscreening/CRCscreening-Workflow/") # List available datasets in the package # data(package = "CrcBiomeScreen") # Load the toy dataset from curatedMetagenomicData data(Thomas_2018_RelativeAbundance, package = "CrcBiomeScreen") # Create the CrcBiomeScreenObject ## Direct creation from TreeSummarizedExperiment # library(curatedMetagenomicData) # tse <- curatedMetagenomicData("ThomasAM_2018a.relative_abundance", # dryrun = FALSE, rownames = "short")[[1]] tse <- Thomas_2018_RelativeAbundance$`2021-03-31.ThomasAM_2018a.relative_abundance` CrcBiomeScreenObject <- CreateCrcBiomeScreenObjectFromTSE(tse) # Load the simple toy dataset from curatedMetagenomicData data(NHSBCSP_screeningData, package = "CrcBiomeScreen") # Create the CrcBiomeScreenObject for simple toy data CrcBiomeScreenObject_NHSBCSP_screeningData <- CreateCrcBiomeScreenObject( AbsoluteAbundance = NHSBCSP_screeningData[,-c(1:2,646:647)], TaxaData = colnames(NHSBCSP_screeningData[,-c(1:2,646:647)]), SampleData = NHSBCSP_screeningData[,c(1:2,646:647)] ) ## Data exploration head(getTaxaData(CrcBiomeScreenObject)) dim(getRelativeAbundance(CrcBiomeScreenObject)) summary(getSampleData(CrcBiomeScreenObject)$age) ## ----eval = TRUE-------------------------------------------------------------- # Split the taxa into multiple levels[If not done already] # CrcBiomeScreenObject <- SplitTaxas(CrcBiomeScreenObject) head(getTaxaData(CrcBiomeScreenObject)) taxa <- getTaxaData(CrcBiomeScreenObject) colnames(taxa) <- c("Kingdom","Phylum","Class","Order","Family","Genus","Species") setTaxaData(CrcBiomeScreenObject) <- taxa colnames(getTaxaData(CrcBiomeScreenObject)) # Keep only the genus level data CrcBiomeScreenObject <- KeepTaxonomicLevel(CrcBiomeScreenObject,level = "Genus") # Inspect processed data head(getTaxaData(CrcBiomeScreenObject)) dim(getTaxaData(CrcBiomeScreenObject)) ## ----eval = TRUE-------------------------------------------------------------- # Normalize using GMPR (Geometric Mean of Pairwise Ratios) CrcBiomeScreenObject <- NormalizeData(CrcBiomeScreenObject, method = "GMPR", level = "Genus") # Normalize using TSS (Total Sum Scaling) CrcBiomeScreenObject <- NormalizeData(CrcBiomeScreenObject, method = "TSS", level = "Genus") ## ----eval = TRUE-------------------------------------------------------------- set.seed(123) # ------------------------- # Toy taxonomy # ------------------------- toy_taxa_strings <- c( "D_0__Bacteria|D_1__Firmicutes|D_2__Clostridia|D_3__OrderA|D_4__FamilyA|D_5__GenusA", "D_0__Bacteria|D_1__Firmicutes|D_2__Clostridia|D_3__OrderB|D_4__FamilyB|D_5__GenusB", "D_0__Bacteria|D_1__Firmicutes|D_2__Clostridia|D_3__OrderC|D_4__FamilyC|D_5__GenusC", "D_0__Bacteria|D_1__Bacteroidetes|D_2__Bacteroidia|D_3__OrderD|D_4__FamilyD|D_5__GenusD", "D_0__Bacteria|D_1__Bacteroidetes|D_2__Bacteroidia|D_3__OrderE|D_4__FamilyE|D_5__GenusE", "D_0__Bacteria|D_1__Proteobacteria|D_2__Gammaproteobacteria|D_3__OrderF|D_4__FamilyF|D_5__GenusF" ) toy_taxa <- data.frame( Taxa = toy_taxa_strings, stringsAsFactors = FALSE ) # ------------------------- # Toy training abundance # 6 taxa x 12 samples # ------------------------- train_samples <- paste0("S", 1:12) toy_abs <- matrix( c( rpois(6 * 6, lambda = 54.8887777), # controls rpois(6 * 6, lambda = 55) # CRC ), nrow = 6, ncol = 12 ) rownames(toy_abs) <- toy_taxa_strings colnames(toy_abs) <- train_samples toy_abs <- as.data.frame(toy_abs) toy_sample <- data.frame( number_reads = rep(10000, 12), study_condition = c(rep("control", 6), rep("CRC", 6)), row.names = train_samples, stringsAsFactors = FALSE ) # ------------------------- # 1. Create toy training data # ------------------------- toy_obj <- CreateCrcBiomeScreenObject( AbsoluteAbundance = toy_abs, TaxaData = toy_taxa, SampleData = toy_sample ) # ------------------------- # 2. Keep one taxonomic level and normalize # ------------------------- toy_obj <- SplitTaxas(toy_obj) toy_obj <- KeepTaxonomicLevel(toy_obj, level = "Genus") toy_obj <- NormalizeData(toy_obj, method = "TSS", level = "Genus") # ------------------------- # 3. Split dataset # ------------------------- toy_obj <- SplitDataSet( toy_obj, label = c("control", "CRC"), partition = 0.7 ) # Perform quality control using cmdscale toy_obj <- qcByCmdscale( toy_obj, TaskName = "toy_obj", normalize_method = "TSS", plot = FALSE ) # Check class balance in the training data checkClassBalance(getModelData(toy_obj)$TrainLabel) ## ----eval = TRUE-------------------------------------------------------------- # ------------------------------- # 4. Train and Evaluate RF model # ------------------------------- toy_obj <- TrainModels( toy_obj, model_type = "RF", TaskName = "toy_rf", ClassWeights = FALSE, TrueLabel = "CRC", num_cores = 1, n_cv = 2 ) toy_obj <- EvaluateModel( toy_obj, model_type = "RF", TaskName = "ToyData_RF_Test", TrueLabel = "CRC", PlotAUC = FALSE ) ## ----eval = TRUE-------------------------------------------------------------- # ------------------------- # 5. Create toy validation data # ------------------------- val_taxa_strings <- c( "D_0__Bacteria|D_1__Firmicutes|D_2__Clostridia|D_3__OrderA|D_4__FamilyA|D_5__GenusA", "D_0__Bacteria|D_1__Firmicutes|D_2__Clostridia|D_3__OrderB|D_4__FamilyB|D_5__GenusB", "D_0__Bacteria|D_1__Firmicutes|D_2__Clostridia|D_3__OrderC|D_4__FamilyC|D_5__GenusC", "D_0__Bacteria|D_1__Bacteroidetes|D_2__Bacteroidia|D_3__OrderD|D_4__FamilyD|D_5__GenusD", "D_0__Bacteria|D_1__Bacteroidetes|D_2__Bacteroidia|D_3__OrderE|D_4__FamilyE|D_5__GenusE", "D_0__Bacteria|D_1__Proteobacteria|D_2__Gammaproteobacteria|D_3__OrderF|D_4__FamilyF|D_5__GenusF" ) val_taxa <- data.frame( Taxa = val_taxa_strings, stringsAsFactors = FALSE ) val_samples <- paste0("V", 1:8) val_abund <- matrix( c( rpois(6 * 4, lambda = 38), rpois(6 * 4, lambda = 48) ), nrow = 6, ncol = 8 ) rownames(val_abund) <- val_taxa_strings colnames(val_abund) <- val_samples val_abund <- as.data.frame(val_abund) val_sample <- data.frame( number_reads = rep(10000, 8), study_condition = c(rep("control", 4), rep("CRC", 4)), condition = c(rep("control", 4), rep("CRC", 4)), row.names = val_samples, stringsAsFactors = FALSE ) val_obj <- CreateCrcBiomeScreenObject( AbsoluteAbundance = val_abund, TaxaData = val_taxa, SampleData = val_sample ) val_obj <- SplitTaxas(val_obj) val_obj <- KeepTaxonomicLevel(val_obj, level = "Genus") val_obj <- NormalizeData(val_obj, method = "TSS", level = "Genus") # ------------------------- # 6. Align features # ------------------------- train_norm <- getNormalizedData(toy_obj) val_norm <- getNormalizedData(val_obj) common_features <- intersect(colnames(train_norm), colnames(val_norm)) setNormalizedData(toy_obj) <- train_norm[, common_features, drop = FALSE] setNormalizedData(val_obj) <- val_norm[, common_features, drop = FALSE] # ------------------------- # 7. Validate model # ------------------------- validated_obj <- ValidateModelOnData( toy_obj, ValidationData = val_obj, model_type = "RF", TaskName = "toy_validation", TrueLabel = "CRC", PlotAUC = FALSE ) ## ----eval = TRUE-------------------------------------------------------------- # ---------------------------------- # 8. Minimal end-to-end RunScreening # ---------------------------------- toy_obj2 <- CreateCrcBiomeScreenObject( AbsoluteAbundance = toy_abs, TaxaData = toy_taxa, SampleData = toy_sample ) toy_obj2 <- SplitTaxas(toy_obj2) toy_obj2 <- KeepTaxonomicLevel(toy_obj2, level = "Genus") toy_obj2 <- NormalizeData(toy_obj2, method = "TSS", level = "Genus") toy_obj2 <- RunScreening( obj = toy_obj2, model_type = "RF", partition = 0.7, split.requirement = list( label = c("control", "CRC"), condition_col = "study_condition" ), ClassWeights = FALSE, n_cv = 2, num_cores = 1, TaskName = "RF_TSS_toydata", ValidationData = val_obj, TrueLabel = "CRC" ) ## ----eval = TRUE-------------------------------------------------------------- # ------------------------- # 9. Prediction on new data # ------------------------- toy_obj <- PredictCrcBiomeScreen( toy_obj, newdata = getNormalizedData(val_obj), model_type = "RF" ) # Inspect the predictions head(getPredictResult(toy_obj)$RF) #> control CRC #> V1 0.5122222 0.4877778 #> V2 0.6066667 0.3933333 #> V3 0.4205556 0.5794444 #> V4 0.5255556 0.4744444 #> V5 0.4772222 0.5227778 #> V6 0.4677778 0.5322222 predictions <- getPredictResult(toy_obj)$RF # ------------------------- # 10. Evaluate predictions # ------------------------- evaluation_results <- EvaluateCrcBiomeScreen( predictions = predictions, true_labels = getSampleData(val_obj)$study_condition, TrueLabel = "CRC", TaskName = "Toy_Prediction_Eval" ) ## ----eval = TRUE-------------------------------------------------------------- # Check the available labels in the data table(getSampleData(CrcBiomeScreenObject)$study_condition) # Filter data to keep only CRC and control samples CrcBiomeScreenObject <- FilterDataSet( CrcBiomeScreenObject, label = c("CRC", "control"), condition_col = "study_condition" ) # Verify the filtering results table(getSampleData(CrcBiomeScreenObject)$study_condition) # Split data into training (70%) and test (30%) sets CrcBiomeScreenObject <- SplitDataSet( CrcBiomeScreenObject, label = c("control", "CRC"), partition = 0.7 ) ## ----eval = TRUE-------------------------------------------------------------- # Perform quality control using cmdscale CrcBiomeScreenObject <- qcByCmdscale( CrcBiomeScreenObject, TaskName = "Normalize_ToyData_filtered_qc", normalize_method = "GMPR", plot = FALSE ) # Check class balance in the training data checkClassBalance(getModelData(CrcBiomeScreenObject)$TrainLabel) ## ----eval = FALSE------------------------------------------------------------- # # Assuming 'CrcBiomeScreenObject' contains a trained XGBoost model # # and 'MyNewData' is your new dataset (e.g., another CrcBiomeScreenObject or a data frame). # # # Generate predictions for the new dataset # ValidationData_filtered_qc <- PredictCrcBiomeScreen( # CrcBiomeScreenObject, # newdata = getNormalizedData(ValidationData_filtered_qc), # Use the appropriate data slot from your new data # model_type = "RF" # ) # # Inspect the predictions # head(getPredictResult(ValidationData_filtered_qc)$RF) # predictions <- getPredictResult(ValidationData_filtered_qc)$RF ## ----eval = FALSE------------------------------------------------------------- # # Assuming 'MyNewData' has a 'study_condition' column in its SampleData # # and the true positive label is "CRC". # # # Evaluate the predictions # evaluation_results <- EvaluateCrcBiomeScreen( # predictions = predictions, # true_labels = getSampleData(ValidationData_filtered_qc)$study_condition, # Provide the actual true labels # TrueLabel = "CRC", # Specify the positive class label # TaskName = "MyNewDatasetEvaluation" # A name for the output plot/files # ) # # # View the evaluation results # print(evaluation_results$AUC) ## ----full_pipeline, eval=FALSE------------------------------------------------ # # Load the toy dataset from curatedMetagenomicData # data(Thomas_2018_RelativeAbundance, package = "CrcBiomeScreen") # # # Create the CrcBiomeScreenObject # tse <- Thomas_2018_RelativeAbundance$`2021-03-31.ThomasAM_2018a.relative_abundance` # CrcBiomeScreenObject <- CreateCrcBiomeScreenObjectFromTSE(tse) # # taxa <- getTaxaData(CrcBiomeScreenObject) # colnames(taxa) <- c("Kingdom","Phylum","Class","Order","Family","Genus","Species") # # setTaxaData(CrcBiomeScreenObject) <- taxa # # # Keep only the genus level data # CrcBiomeScreenObject <- KeepTaxonomicLevel(CrcBiomeScreenObject,level = "Genus") # # # Inspect processed data # head(getTaxaData(CrcBiomeScreenObject)) # # # Normalize using GMPR (Geometric Mean of Pairwise Ratios) # CrcBiomeScreenObject <- NormalizeData(CrcBiomeScreenObject, method = "GMPR", level = "Genus") # # # Filter data to keep only CRC and control samples # CrcBiomeScreenObject <- FilterDataSet( # CrcBiomeScreenObject, # label = c("CRC", "control"), # condition_col = "study_condition" # ) # # # Perform quality control using cmdscale # CrcBiomeScreenObject <- qcByCmdscale( # CrcBiomeScreenObject, # TaskName = "Normalize_ToyData_filtered_qc", # normalize_method = "GMPR", # plot = FALSE # ) # # # Verify the filtering results # table(getSampleData(CrcBiomeScreenObject)$study_condition) # # # Split data into training (70%) and test (30%) sets # CrcBiomeScreenObject <- SplitDataSet( # CrcBiomeScreenObject, # label = c("control", "CRC"), # partition = 0.7 # ) # # # Check class balance in the training data # checkClassBalance(getModelData(CrcBiomeScreenObject)$TrainLabel) # # # Load validation data # data("ZellerG_2014_RelativeAbundance") # # ValidationData_tse <- ZellerG_2014_RelativeAbundance$`2021-03-31.ZellerG_2014.relative_abundance` # # Create CrcBiomeScreenObject for validation data # ValidationData <- CreateCrcBiomeScreenObjectFromTSE(ValidationData_tse) # # # Process validation data similarly to training data # # ValidationData <- SplitTaxas(ValidationData) # taxa <- getTaxaData(ValidationData) # colnames(taxa) <- c("Kingdom","Phylum","Class","Order","Family","Genus","Species") # # setTaxaData(ValidationData) <- taxa # # ValidationData <- KeepTaxonomicLevel(ValidationData,level = "Genus") # ValidationData <- NormalizeData(ValidationData, method = "GMPR", level = "Genus") # # # Align features between training and validation data # val_norm <- getNormalizedData(ValidationData) # crc_norm <- getNormalizedData(CrcBiomeScreenObject) # # Ensure consistent features between training and validation data # val_norm <- val_norm[, colnames(val_norm) %in% colnames(crc_norm)] # crc_norm <- crc_norm[, colnames(crc_norm) %in% colnames(val_norm)] # # Set back the normalized data # setNormalizedData(ValidationData) <- val_norm # setNormalizedData(CrcBiomeScreenObject) <- crc_norm # # # Filter validation data # ValidationData_filtered <- FilterDataSet( # ValidationData, # label = c("CRC", "control"), # condition_col = "study_condition" # ) # # # Quality control for validation data # ValidationData_filtered_qc <- qcByCmdscale( # ValidationData_filtered, # TaskName = "Normalize_ValidationData_filtered_qc", # normalize_method = "GMPR", # plot = FALSE # ) # # # Run the complete screening workflow # ValidationData_filtered_qc <- RunScreening( # # Input data # obj = CrcBiomeScreenObject, # # Model and data splitting # model_type = "RF", # partition = 0.7, # split.requirement = list( # label = c("control", "CRC"), # condition_col = "study_condition" # ), # ClassWeights = FALSE, # # # Cross-validation and parallelization # n_cv = 10, # num_cores = 10, # # # Task and output naming # TaskName = "RF_GMPR_toydata", # # # External validation # ValidationData = ValidationData_filtered_qc, # TrueLabel = "CRC" # ) # # # Example with XGBoost model # ValidationData_filtered_qc <- RunScreening( # CrcBiomeScreenObject, # model = "RF", # partition = 0.7, # split.requirement = list( # label = c("control", "CRC"), # condition_col = "study_condition" # ), # ClassWeights = TRUE, # n_cv = 10, # num_cores = 10, # TaskName = "RF_GMPR_toydata", # ValidationData = ValidationData_filtered_qc, # TrueLabel = "CRC" # ) ## ----sessioninfo-------------------------------------------------------------- sessionInfo()