Edit: Updated solution using the fast melt/dcast methods implemented in data.table
versions >= 1.9.0. Go here for more info.
require(data.table)
require(reshape2)
dt <- as.data.table(df)
# melt data.table
dt.m <- melt(dt, id=c("marker", "alleleA", "alleleB"),
variable.name="id", value.name="val")
dt.m[, id := gsub("\.[0-9]+$", "", id)] # replace `.[0-9]` with nothing
# aggregation
dt.m <- dt.m[, list(alleleA = alleleA[1],
alleleB = alleleB[1], val = max(val)),
keyby=list(marker, id)][val <= 0.8, val := NA]
# casting back
dt.c <- dcast.data.table(dt.m, marker + alleleA + alleleB ~ id)
# marker alleleA alleleB X345 X346 X818
# 1: chr3_21902130_21902131_A_T A T NA 0.8626 0.8626
# 2: chr3_21902134_21902135_T_C T C NA NA NA
# 3: kgp5209280_chr3_21902067 T A 1 1.0000 1.0000
Solution 1: Probably not the best way, but this is what I could think of at the moment:
mm <- t(apply(df[-(1:3)], 1, function(x) tapply(x, gl(3,3), max)))
mode(mm) <- "numeric"
mm[mm < 0.8] <- NA
# you can set the column names of mm here if necessary
out <- cbind(df[, 1:3], mm)
# marker alleleA alleleB 1 2 3
# 1 kgp5209280_chr3_21902067 T A 1.0000 1 1.0000
# 2 chr3_21902130_21902131_A_T A T 0.8626 NA 0.8626
# 3 chr3_21902134_21902135_T_C T C NA NA NA
gl(3,3)
gives a factor with values 1,1,1,2,2,2,3,3,3
with levels 1,2,3
. That is, tapply
will take the values x
3 at a time and get their max
(first 3, next 3 and the last 3). And apply
sends each row one by one.
Solution 2: A data.table
solution with melt
and cast
within data.table
without using reshape
or reshape2
:
require(data.table)
dt <- data.table(df)
# melt your data.table to long format
dt.melt <- dt[, list(id = names(.SD), val = unlist(.SD)),
by=list(marker, alleleA, alleleB)]
# replace `.[0-9]` with nothing
dt.melt[, id := gsub("\.[0-9]+$", "", id)]
# get max value grouping by marker and id
dt.melt <- dt.melt[, list(alleleA = alleleA[1],
alleleB = alleleB[1],
val = max(val)),
keyby=list(marker, id)][val <= 0.8, val := NA]
# edit mnel (use setattr(,'names') to avoid copy by `names<-` within `setNames`
dt.cast <- dt.melt[, as.list(setattr(val,'names', id)),
by=list(marker, alleleA, alleleB)]
# marker alleleA alleleB X345 X346 X818
# 1: chr3_21902130_21902131_A_T A T NA 0.8626 0.8626
# 2: chr3_21902134_21902135_T_C T C NA NA NA
# 3: kgp5209280_chr3_21902067 T A 1 1.0000 1.0000