################################################################################ # Custom R script to prepare a DNA methylation dataset GSE116300 from GEO # for analysis in EpigenCentral portal. The GEO dataset is available at: # https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE116300 # # Instructions: # 1. Download the archive file GSE116300_RAW.tar from the GEO site. # 2. Extract the archive, thereby creating a folder GSE116300_RAW with data. # 3. Place this R script into the folder GSE116300_RAW. # 4. Open the script in Rstudio and run it. # # The script should arrange IDAT files into folders based on the Illumina chip, # remove unused auxiliary GEO files, and create two additional metadata files # by retrieving sample information from GEO. ################################################################################ MAIN_GEO = "GSE116300" ################################################################################ # Rename and organize IDAT files by array chip for the GSE116300 dataset. idats = list.files(pattern = '.idat') # get names of IDAT files chips = gsub('^(GSM\\d+_)?(\\d+)_.+', '\\2', idats) # remove GEO tags for(chip in unique(chips)) { dir.create( chip) # create a folder for each chip ID } # move each IDAT file to its corresponding chip folder idats_to = file.path(chips, gsub('^GSM\\d+_', '', idats)) file.rename(from = idats, to = idats_to) # remove unused microarray platform files file.remove( list.files(pattern = "^GPL")) ################################################################################ # Extract the sample sheet metadata for the GSE116300 dataset. # Install the required packages - only when running the script for the 1st time. install.packages("plyr") if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("GEOquery") # load R packages library(GEOquery) library(plyr) # extract phenotype metadata gse_list = getGEO(GEO = MAIN_GEO, getGPL=FALSE) pheno = pData(gse_list[[1]]) # select columns: sample title, accession ID, characteristics pheno = pheno[, grep("title|geo_accession|:ch1$", colnames(pheno))] # rename columns colnames(pheno) = gsub(":ch1$", "", colnames(pheno)) pheno = rename (pheno, c("title" = "Sample_Name", "phenotype" = "Sample_Group", "slide id" = "Sentrix_ID", "array position" = "Sentrix_Position")) # save to a sample sheet file write.csv(pheno, file = "kabuki.samples.csv", row.names = F, quote = F) ################################################################################ # Extract the design file for the GSE116300 dataset. # identify cases and controls is_case = grepl("KMT2D", pheno$mutation) & grepl("pathogenic", pheno$`variant classification`) is_control = pheno$Sample_Group == 'normal' # create a design table and save to a file design = data.frame(Sample = pheno$Sample_Name, KS = ifelse(is_case, 2, ifelse(is_control, 1, 0))) write.table(design, file = "kabuki.design", sep = "\t", row.names = F, quote = F) # End of script