R code snippets

# read data from database
con <- dbConnect(MySQL(), host="mysql.database.com", user="someone", password="secret", dbname="somedb")
d <- dbReadTable(con, "some_table")
dbDisconnect(con)

# read in a delimited text file 
data <- read.table(file="somefile.txt", sep="\t", header=T, quote="", na.string="", as.is=T, check.names=F)

# generate random data.frame for testing 
x <- data.frame(c1=rnorm(10), c2=rnorm(10), f1=sample(c("a", "b", "c", "d"), 10, replace=T), f2=sample(c("a", "b", "c", "d"), 10, replace=T))

# filter data for 2 way ANOVA function and test
set.seed(1872011)
# generate test data
nsize <- 50
x <- data.frame(c1=rnorm(nsize), c2=rnorm(nsize), f1=sample(c("a", "b", "c", "d"), nsize, replace=T), f2=sample(c("a", "b", "c", "d"), nsize, replace=T))
filterForTwoWayAnova <- function(d, x, y) {
  if (!is.data.frame(d)) {
    stop("Argument d is not a data.frame.")
  }
  createCountsMatrix <- function(x, y) {
    x <- as.character(x)
    y <- as.character(y)
    counts <- matrix(0, nrow=length(unique(x)), ncol=length(unique(y)))
    rownames(counts) <- unique(x)
    colnames(counts) <- unique(y)
    for (i in rownames(counts)) {
      for (j in rownames(counts)) {
        counts[i, j] <- sum(x==i & y==j)
      }
    }
    return(counts)
  }
  countMatrixOk <- function(counts) {
    for (i in rownames(counts)) {
      for (j in colnames(counts)) {
        if (counts[i, j] < 2) {
          return(FALSE)
        }
      }
    }
    return(TRUE)
  }
  trimCountMatrix <- function(count, type, index) {
    if (type=="row") {
      return(count[-which(rownames(count)==index), ])
    }
    else {
      return(count[, -which(colnames(count)==index)])
    }
  }
  # change to character
  d[, x] <- as.character(d[, x])
  d[, y] <- as.character(d[, y])
  # get the counts matrix
  counts <- createCountsMatrix(d[, x], d[, y])
  # check to see if the data is ok
  if (countMatrixOk(counts)) {
    return(d)
  }
  else {
    # get the list of possible deletions
    possibleDeletions <- list()
    possibleDeletions <- append(possibleDeletions, llply(rownames(counts)[which(apply(counts, 1, function(x) sum(x<2)) > 0)], function(x) return(list(type="row", index=x))))
    possibleDeletions <- append(possibleDeletions, llply(colnames(counts)[which(apply(counts, 2, function(x) sum(x<2)) > 0)], function(x) return(list(type="col", index=x))))
    # find solutions
    solutions <- list()
    for (p in permn(1:length(possibleDeletions))) {
      c <- counts
      for (i in p) {
        c <- trimCountMatrix(c, possibleDeletions[[i]]$type, possibleDeletions[[i]]$index)
        if (is.null(nrow(c)) || is.null(ncol(c)) || nrow(c) < 2 || ncol(c) < 2) {
          break
        }
        else if(countMatrixOk(c)) {
          solutions[[length(solutions)+1]] <- list(solution=c, sum=sum(c), dim=dim(c)[1] + dim(c)[2])
          break
        }
      }
    }
    if (length(solutions) > 0) {
      solution <- solutions[[order(sapply(solutions, function(x) x$sum), sapply(solutions, function(x) x$sum), decreasing=T)[1]]]
      d <- d[d[, x] %in% rownames(solution$solution) & d[, y] %in% colnames(solution$solution), ]
      d[, x] <- factor(d[, x])
      d[, y] <- factor(d[, y])
      return(d)
    }
    else {
      return(NULL)
    }
  }
}
x1 <- filterForTwoWayAnova(x, "f1", "f2")

Leave a Reply

Your email address will not be published. Required fields are marked *