# generate unique bit compounds
# n - number of compounds to generate
# len - number of bits per compound
# v - vector denoting the probability of each bit being set
# This version generates the fingerprints in bulk and recursively if no duplicates are allowed
# This version stores the compounds as integers
# the single maximum integer that can be stored is 2^31-1, or 31 bits
# this version avoids the use of going to/from the individual bit vectors, which is slow
# This version uses data.table
createbitcmpds<-function(n,len,v=rep(0.5,len),allowduplicates=FALSE)
{
if (n>2^len) stop("number of compounds exceeds compound space")
m=ceiling(len/31)
cmpds=matrix(0,n,m)
for (i in 1:len)
{
x=sample(c(1,0),n,TRUE,c(v[i],1-v[i]))
if (i%%31>0) col=i%/%31+1
else col=i%/%31
# %% refers to the mod operation
# %/% refers to the div operation
# set bits to go from left to right (the first bit goes into the first column, setting 2^31-1)
if (i%%31==0) pow=0
else pow=31-i%%31
cmpds[,col]=cmpds[,col]+x*2^pow
}
if (!allowduplicates)
{
cmpds=as.data.table(cmpds)
cmpds=unique(cmpds)
while (nrow(cmpds)!=n)
{
cmpdssub=createbitcmpds(n-nrow(cmpds),len,v,allowduplicates)
cmpdssub=as.data.table(cmpdssub)
cmpds=rbind(cmpds,cmpdssub)
cmpds=unique(cmpds)
}
}
cmpds=as.matrix(cmpds)
dimnames(cmpds)=NULL
cmpds
}
# obtain bit occurrence frequencies in generated compounds
# cmpds - matrix containing compounds
# bits - the bit positions that are to be returned, specify multiple positions with c, e.g. c(1:3) for bits 1-3
getbitfreq<-function(cmpds,bits)
{
if (!is.matrix(cmpds)) stop("cmpds must be a matrix")
if (max(bits)>31*ncol(cmpds)) stop("specified bit not present in data")
result=rep(0,length(bits))
resultbig=rep(0,ncol(cmpds)*31)
v=rep(0,nrow(cmpds))
for (i in 1:ncol(cmpds))
{
v=cmpds[,i]
for (j in 1:31)
{
resultbig[31*(i-1)+j]=length(which(v-2^(31-j)>=0))
v[v-2^(31-j)>=0]=v[v-2^(31-j)>=0]-2^(31-j)
}
}
result=resultbig[bits]
result=result/nrow(cmpds)
result
}
# Get observed fractions of first bit with changes in the size of the compound
# library relative to the compound space.
# repeat each simulation 10 times, record each individual result for subsequent
# analysis. Use 20 bit compounds, with expected occurrence frequencies of
# 0.025 (smallest), 0.05 (small), 0.1 (medium), 0.2 (large), and 0.4 (largest).
set.seed(271828183) # e rounded and multiplied to reach largest integer
source("createbitcmpds.R")
source("getbitfreq.R")
smallest=matrix(0,31,10)
for (j in seq(1048576/32,1048576*31/32,1048576/32))
{
for (i in 1:10)
{
cmpds=createbitcmpds(j,20,c(0.025,rep(0.5,19)))
temp=getbitfreq(cmpds,1)
smallest[round(j/32768),i]=temp
}
print(paste(round(j*100/32768/31),"% of smallest complete ",date(),sep=""))
}
small=matrix(0,31,10)
for (j in seq(1048576/32,1048576*31/32,1048576/32))
{
for (i in 1:10)
{
cmpds=createbitcmpds(j,20,c(0.05,rep(0.5,19)))
temp=getbitfreq(cmpds,1)
small[round(j/32768),i]=temp
}
print(paste(round(j*100/32768/31),"% of small complete ",date(),sep=""))
}
medium=matrix(0,31,10)
for (j in seq(1048576/32,1048576*31/32,1048576/32))
{
for (i in 1:10)
{
cmpds=createbitcmpds(j,20,c(0.1,rep(0.5,19)))
temp=getbitfreq(cmpds,1)
medium[round(j/32768),i]=temp
}
print(paste(round(j*100/32768/31),"% of medium complete ",date(),sep=""))
}
print("medium complete")
large=matrix(0,31,10)
for (j in seq(1048576/32,1048576*31/32,1048576/32))
{
for (i in 1:10)
{
cmpds=createbitcmpds(j,20,c(0.2,rep(0.5,19)))
temp=getbitfreq(cmpds,1)
large[round(j/32768),i]=temp
}
print(paste(round(j*100/32768/31),"% of large complete ",date(),sep=""))
}
largest=matrix(0,31,10)
for (j in seq(1048576/32,1048576*31/32,1048576/32))
{
for (i in 1:10)
{
cmpds=createbitcmpds(j,20,c(0.4,rep(0.5,19)))
temp=getbitfreq(cmpds,1)
largest[round(j/32768),i]=temp
}
print(paste(round(j*100/32768/31),"% of largest complete ",date(),sep=""))
}
save(smallest,small,medium,large,largest,file="libsize_cmpdspace.RData")
# Get observed fractions for first bit with set expected values
# repeat each simulation 10 times, record each individual result for subsequent analysis
# use 20 bit compounds, with libraries that span 1/1024, 1/128, and 1/8 of compound space
# results in matrices names small, medium, and large
set.seed(271828183) # e rounded and multiplied to reach largest integer
source("createbitcmpds.R")
source("getbitfreq.R")
small=matrix(0,49,10)
for (j in seq(0.02,0.98,0.02))
{
for (i in 1:10)
{
cmpds=createbitcmpds(1024,20,c(j,rep(0.5,19)))
temp=getbitfreq(cmpds,1)
small[round(j*50),i]=temp[1]
}
print(paste(round(j*50*100/49),"percent of small complete"))
}
medium=matrix(0,49,10)
for (j in seq(0.02,0.98,0.02))
{
for (i in 1:10)
{
cmpds=createbitcmpds(8192,20,c(j,rep(0.5,19)))
temp=getbitfreq(cmpds,1)
medium[round(j*50),i]=temp[1]
}
print(paste(round(j*50*100/49),"percent of medium complete"))
}
large=matrix(0,49,10)
for (j in seq(0.02,0.98,0.02))
{
for (i in 1:10)
{
cmpds=createbitcmpds(131072,20,c(j,rep(0.5,19)))
temp=getbitfreq(cmpds,1)
large[round(j*50),i]=temp[1]
}
print(paste(round(j*50*100/49),"percent of large complete"))
}
save(small,medium,large,file="obs_exp_frac.RData")
# validate systematic skew for case examined with smallest effect
# (small small - library 1/1024 of compound space, E = 0.02)
# repeat with newly generated libraries and calculate average O values averaged
# with previous O values
set.seed(314159265) # pi rounded and multiplied to reach largest integer
O=rep(0,1000000)
for (i in 1:1000000)
{
x=createbitcmpds(1024,20,c(0.02,rep(0.5,19)))
O[i]=getbitfreq(x,1)
}
Oss=O
# (small large - library 1/1024 of compound space, E = 0.25)
O=rep(0,1000000)
for (i in 1:1000000)
{
x=createbitcmpds(1024,20,c(0.25,rep(0.5,19)))
O[i]=getbitfreq(x,1)
}
Osl=O