################################################ # Random Variables # - using R to generate random numbers ################################################ # we can choose from lots of different distributions # such as norm, unif, binom, etc... # 'r' refers to a random number # 'p' refers to a quantile of a given value # 'q' refers to the value of a given quantile # first things first, let's set the seed. This is very important! # if you want to be able to recreate your random numbers, you should always set the seed. set.seed(555) # Normal Distribution: # take 1000 draws from the standard normal distribution ?rnorm # we see that standard normal (ie mean = 0, sd = 1) is default rand.draw = rnorm(1000) # alternatively, we can specify: rand.draw2 = rnorm(1000, mean = 0, sd = 1) # let's plot our results plot(rand.draw) plot(density(rand.draw)) plot(rand.draw2) plot(density(rand.draw2)) # the summary function # the summary function will return the quantiles of a vector summary(rand.draw) # looks like we are a little skewed... let's try more data points! rand.draw3 = rnorm(10000) plot(density(rand.draw3)) summary(rand.draw3) # we can check what quantile a certain value is in a distribution pnorm(.5, mean = 0, sd = 1) pnorm(.6) # or what value is a given quantile qnorm(0.25) qnorm(.5) # of course, we don't have to use the standard normal rand.draw4 = rnorm(1000, mean = 25, sd = 10) plot(density(rand.draw4)) # in fact, we don't have to use the normal distribution at all rand.unif = runif(1000, min = -5, max = 5) plot(rand.unif) plot(density(rand.unif)) # looks pretty messy... how about if we increase n? rand.unif2 = runif(10000, min = -5, max = 5) plot(rand.unif2) plot(density(rand.unif2)) # or, the binomial distribution ?rbinom rand.binom = rbinom(1000, size = 1, prob = 0.3) plot(rand.binom) plot(density(rand.binom)) # what if, instead of just generating random numbers, we want to generate samples of a population using a fixed margin? # NOTE: This is the function you want to use on the homework problem 3.c., not rbinom since rbinom will not have fixed margins ?sample # let's create a vector, and take a random 100 observation sample: x = rnorm(1000) length(x) rand.sample = sample(x, 100, replace = FALSE) length(rand.sample) plot(density(x)) plot(density(rand.sample)) # how could we do this using indexes? # pass in a vector of integers from 1:length(x), and sample 100 of those integers rand.sample.index = sample(1:length(x), 100, replace = FALSE) rand.sample.index length(rand.sample.index) # now we can index by this object x.sample = x[rand.sample.index] length(x) length(x.sample) # When might we use random numbers? # Lady Tasting Tea # We learned that there are two types of Fisher exact tests, one with fixed margins and one without # Let's pretend we are Fisher, and we want to randomly assign the tea cups to milk first or tea first. tea.cups = c(1:8) tea.cups # In the original experiment, Fisher told the lady that there were four milk first and four tea first cups, therefore there are fixed margins. This means that we want to randomly generate assignment of milk first for four cups out of the 8. milk.first = sample(tea.cups, 4, replace = FALSE) milk.first # the rest, by design, must be tea first tea.first = tea.cups[-milk.first] tea.first # let's make sure that all of the cups are accounted for: sort(c(milk.first, tea.first)) # if we want to create a treatment assignment variable of zeros and ones, we can create it using the above results as indexes milk.first.dummy = rep(0, 8) milk.first.dummy milk.first.dummy[milk.first] = 1 milk.first.dummy milk.first tea.first.dummy = rep(0, 8) tea.first.dummy[tea.first] = 1 tea.first.dummy # However, we saw that the fixed margins doesn't provide much power in its design. Therefore, Fisher could have randomly assigned each cup to milk first or tea first. milk.first2 = rbinom(8, size = 1, prob = 0.5) milk.first2 tea.first2 = as.integer(!milk.first2) tea.first2 ############################ # Simulation: # We could also use random numbers to do a simulation. # Example: Correct Coverage Simulation ############################ set.seed(15) n = 2000 n.simul = 1000 c = 0.01 count = 0 for( i in 1:n.simul) { # Use random numbers to simulate some data, an X and an error X = rnorm(n, mean = 5, sd = 2) eps = rnorm(n, mean = 0, sd = 1) # Create a true model and make Y using that model Y = 5 + 10 * X + eps # Run OLS ols = summary(lm(Y ~ 1 + X)) # Now, test our hypothesis beta = ols$coefficients[2, 1] se = ols$coefficients[2, 2] t = (beta - 10) / se pval = 1 - pt(t, df = n - 2) if(pval <= c ) { count = count + 1 } } # our coverage is the number of counts for which our hypothesis was true as a percentage of the number of simulations cov = count / n.simul count cat( "\n We should have coverage of ", c, "and we are approximately correct because we reject the null", cov * 100, "percent of the time \n")