# generate unique bit compounds

# n - number of compounds to generate

# len - number of bits per compound

# v - vector denoting the probability of each bit being set

# This version generates the fingerprints in bulk and recursively if no duplicates are allowed

# This version stores the compounds as integers

# the single maximum integer that can be stored is 2^31-1, or 31 bits

# this version avoids the use of going to/from the individual bit vectors, which is slow

# This version uses data.table

createbitcmpds<-function(n,len,v=rep(0.5,len),allowduplicates=FALSE)

{

if (n>2^len) stop("number of compounds exceeds compound space")

m=ceiling(len/31)

cmpds=matrix(0,n,m)

for (i in 1:len)

{

x=sample(c(1,0),n,TRUE,c(v[i],1-v[i]))

if (i%%31>0) col=i%/%31+1

else col=i%/%31

# %% refers to the mod operation

# %/% refers to the div operation

# set bits to go from left to right (the first bit goes into the first column, setting 2^31-1)

if (i%%31==0) pow=0

else pow=31-i%%31

cmpds[,col]=cmpds[,col]+x*2^pow

}

if (!allowduplicates)

{

cmpds=as.data.table(cmpds)

cmpds=unique(cmpds)

while (nrow(cmpds)!=n)

{

cmpdssub=createbitcmpds(n-nrow(cmpds),len,v,allowduplicates)

cmpdssub=as.data.table(cmpdssub)

cmpds=rbind(cmpds,cmpdssub)

cmpds=unique(cmpds)

}

}

cmpds=as.matrix(cmpds)

dimnames(cmpds)=NULL

cmpds

}

# obtain bit occurrence frequencies in generated compounds

# cmpds - matrix containing compounds

# bits - the bit positions that are to be returned, specify multiple positions with c, e.g. c(1:3) for bits 1-3

getbitfreq<-function(cmpds,bits)

{

if (!is.matrix(cmpds)) stop("cmpds must be a matrix")

if (max(bits)>31*ncol(cmpds)) stop("specified bit not present in data")

result=rep(0,length(bits))

resultbig=rep(0,ncol(cmpds)*31)

v=rep(0,nrow(cmpds))

for (i in 1:ncol(cmpds))

{

v=cmpds[,i]

for (j in 1:31)

{

resultbig[31*(i-1)+j]=length(which(v-2^(31-j)>=0))

v[v-2^(31-j)>=0]=v[v-2^(31-j)>=0]-2^(31-j)

}

}

result=resultbig[bits]

result=result/nrow(cmpds)

result

}

# Get observed fractions of first bit with changes in the size of the compound

# library relative to the compound space.

# repeat each simulation 10 times, record each individual result for subsequent

# analysis. Use 20 bit compounds, with expected occurrence frequencies of

# 0.025 (smallest), 0.05 (small), 0.1 (medium), 0.2 (large), and 0.4 (largest).

set.seed(271828183) # e rounded and multiplied to reach largest integer

source("createbitcmpds.R")

source("getbitfreq.R")

smallest=matrix(0,31,10)

for (j in seq(1048576/32,1048576*31/32,1048576/32))

{

for (i in 1:10)

{

cmpds=createbitcmpds(j,20,c(0.025,rep(0.5,19)))

temp=getbitfreq(cmpds,1)

smallest[round(j/32768),i]=temp

}

print(paste(round(j*100/32768/31),"% of smallest complete ",date(),sep=""))

}

small=matrix(0,31,10)

for (j in seq(1048576/32,1048576*31/32,1048576/32))

{

for (i in 1:10)

{

cmpds=createbitcmpds(j,20,c(0.05,rep(0.5,19)))

temp=getbitfreq(cmpds,1)

small[round(j/32768),i]=temp

}

print(paste(round(j*100/32768/31),"% of small complete ",date(),sep=""))

}

medium=matrix(0,31,10)

for (j in seq(1048576/32,1048576*31/32,1048576/32))

{

for (i in 1:10)

{

cmpds=createbitcmpds(j,20,c(0.1,rep(0.5,19)))

temp=getbitfreq(cmpds,1)

medium[round(j/32768),i]=temp

}

print(paste(round(j*100/32768/31),"% of medium complete ",date(),sep=""))

}

print("medium complete")

large=matrix(0,31,10)

for (j in seq(1048576/32,1048576*31/32,1048576/32))

{

for (i in 1:10)

{

cmpds=createbitcmpds(j,20,c(0.2,rep(0.5,19)))

temp=getbitfreq(cmpds,1)

large[round(j/32768),i]=temp

}

print(paste(round(j*100/32768/31),"% of large complete ",date(),sep=""))

}

largest=matrix(0,31,10)

for (j in seq(1048576/32,1048576*31/32,1048576/32))

{

for (i in 1:10)

{

cmpds=createbitcmpds(j,20,c(0.4,rep(0.5,19)))

temp=getbitfreq(cmpds,1)

largest[round(j/32768),i]=temp

}

print(paste(round(j*100/32768/31),"% of largest complete ",date(),sep=""))

}

save(smallest,small,medium,large,largest,file="libsize_cmpdspace.RData")

# Get observed fractions for first bit with set expected values

# repeat each simulation 10 times, record each individual result for subsequent analysis

# use 20 bit compounds, with libraries that span 1/1024, 1/128, and 1/8 of compound space

# results in matrices names small, medium, and large

set.seed(271828183) # e rounded and multiplied to reach largest integer

source("createbitcmpds.R")

source("getbitfreq.R")

small=matrix(0,49,10)

for (j in seq(0.02,0.98,0.02))

{

for (i in 1:10)

{

cmpds=createbitcmpds(1024,20,c(j,rep(0.5,19)))

temp=getbitfreq(cmpds,1)

small[round(j*50),i]=temp[1]

}

print(paste(round(j*50*100/49),"percent of small complete"))

}

medium=matrix(0,49,10)

for (j in seq(0.02,0.98,0.02))

{

for (i in 1:10)

{

cmpds=createbitcmpds(8192,20,c(j,rep(0.5,19)))

temp=getbitfreq(cmpds,1)

medium[round(j*50),i]=temp[1]

}

print(paste(round(j*50*100/49),"percent of medium complete"))

}

large=matrix(0,49,10)

for (j in seq(0.02,0.98,0.02))

{

for (i in 1:10)

{

cmpds=createbitcmpds(131072,20,c(j,rep(0.5,19)))

temp=getbitfreq(cmpds,1)

large[round(j*50),i]=temp[1]

}

print(paste(round(j*50*100/49),"percent of large complete"))

}

save(small,medium,large,file="obs_exp_frac.RData")

# validate systematic skew for case examined with smallest effect

# (small small - library 1/1024 of compound space, E = 0.02)

# repeat with newly generated libraries and calculate average O values averaged

# with previous O values

set.seed(314159265) # pi rounded and multiplied to reach largest integer

O=rep(0,1000000)

for (i in 1:1000000)

{

x=createbitcmpds(1024,20,c(0.02,rep(0.5,19)))

O[i]=getbitfreq(x,1)

}

Oss=O

# (small large - library 1/1024 of compound space, E = 0.25)

O=rep(0,1000000)

for (i in 1:1000000)

{

x=createbitcmpds(1024,20,c(0.25,rep(0.5,19)))

O[i]=getbitfreq(x,1)

}

Osl=O