# Toy example: add two numbers (or element-wise, two numeric vectors).
# Returns number1 + number2.
add_two_numbers <- function(number1, number2){
  return(number1 + number2)
}
add_two_numbers(number1 = 2, number2 = 9)
[1] 11
# We can call function without argument names but it is not good practice e.g.
add_two_numbers(2, 9)
[1] 11
Premise for Inverse CDF method: When you want random samples from a distribution.
Here we use examples from distributions that are easy to sample from so we can compare our results with existing functions. In reality this method would be used for uncommon distributions.
First, let’s recall/introduce how we can make our own functions in R.
Toy example:
# Toy example: add two numbers (or element-wise, two numeric vectors).
# Returns number1 + number2.
add_two_numbers <- function(number1, number2){
  return(number1 + number2)
}
add_two_numbers(number1 = 2, number2 = 9)
[1] 11
# We can call function without argument names but it is not good practice e.g.
add_two_numbers(2, 9)
[1] 11
Before we start generating random numbers, let’s set the seed to make our experiments reproducible.
To understand why we set a seed run the chunk below and compare the output. Then run it again and compare between runs.
rnorm(1)
[1] 2.274749
rnorm(1)
[1] 0.9803857
In order to make our analysis reproducible we need to set the seed, allowing another person to run the script and get the same results.
set.seed(345454)
What is the inverse CDF of an exponential distribution? Uh oh. How can we calculate this? R has functions that compute quantiles, and quantiles return the same values as inverse CDFs.
For example, if \(X \sim Exp(0.1)\), then number \(c\) such that \(Pr(X < c) = 0.6\) is \(F^{-1}(0.6)\) and can be obtained by the following R code
qexp(0.6, rate = 0.1)
[1] 9.162907
Equipped with this observation, let’s compute random draws from an exponential distribution using the inverse CDF method.
# Draw num_draws samples from Exponential(rate) using the inverse-CDF method:
# plug Uniform(0, 1) draws into the exponential quantile function qexp().
rexp_isi_buds <- function(num_draws, rate = 1){
  runif_draws <- runif(num_draws) # Random draws from uniform distribution
  return(qexp(runif_draws, rate = rate)) # Plug draws into inverse CDF (quantile function)
}
my_exp_sample <- rexp_isi_buds(100000, rate = 0.1)
We can plot our observations to visually assess that we did it correctly. Note the difference in default bin sizes for the two plotting methods.
# base R plot for reference
hist(my_exp_sample)
# ggplot
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.2 ✔ tibble 3.3.0
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data.frame("my_exp_sample" = my_exp_sample) %>% # Need data frame for ggplot()
ggplot(aes(x = my_exp_sample)) +
geom_histogram() +
theme_bw()# Try changing the number of bins by setting bins = 10
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
If you have time : spend a few minutes improving the plot above with recommendations learned last week!
Let’s compare our simulations with simulations produced by the R function rexp()
(base R function)
my_exp_sample2 <- rexp(100000, 0.1)
qqplot(my_exp_sample, my_exp_sample2)
abline(0,1)
We can also compare empirical and theoretical means and standard deviations
# Parentheses tell R to print the value being stored
(emp_exp_mean <- mean(my_exp_sample))
[1] 9.99156
(theor_exp_mean <- 1/0.1)
[1] 10
(emp_exp_sd <- sd(my_exp_sample))
[1] 10.07315
(theor_exp_sd <- sqrt(1/0.1^2))
[1] 10
After the example, can you better explain why we would want to use the inverse cdf method to generate random samples from an uncommon distribution?
Write a function to generate realizations of a normal random variable with a user specified mean and variance. Compare results of your simulations and simulation produced using rnorm().
# Draw num_draws samples from Normal(mean, var) using the inverse-CDF method.
# Note: parameterized by variance, so convert to sd with sqrt(var) for qnorm().
rnorm_isi_buds <- function(num_draws, mean, var){
  runif_draws <- runif(num_draws) # Uniform(0, 1) draws
  return(qnorm(runif_draws, mean = mean, sd = sqrt(var))) # Plug into normal quantile function
}
my_norm_sample <- rnorm_isi_buds(100000, mean = 0, var = 1)
# Looking at different sample sizes
par(mfrow=c(1,2))
hist(my_norm_sample)
hist(rnorm_isi_buds(40, mean=0,var=1))
Comparing it to rnorm, implemented in base R:
qqplot(my_norm_sample, rnorm(100000,mean = 0,sd = 1))
abline(0,1)
(NOTE): the function we implemented takes variance as an argument to parametrize the Normal distribution, while rnorm takes sd. Indeed:
qqplot(
  rnorm_isi_buds(100000, mean = 0, var = 10),
  rnorm(100000, mean = 0, sd = 10)
)
abline(0, 1)
# Draw one Bernoulli(success_prob) realization: returns 1 with probability
# success_prob and 0 otherwise, via a single Uniform(0, 1) draw.
rbernoulli <- function(success_prob){
  return_value <- 0

  if (runif(1) < success_prob){
    return_value <- 1 # else statement is not necessary, because we set return_value to 0 from the start
  }

  return(return_value)
}
Let’s test our function
# Simulate num_sim coin flips with success probability my_prob and
# check that the empirical proportion of successes is close to my_prob.
num_sim <- 1000
my_coins <- numeric(num_sim) # Preallocate storage for the draws
my_prob <- 0.24

for (i in seq_len(num_sim)){
  my_coins[i] <- rbernoulli(my_prob)
}
mean(my_coins) # Empirical success proportion; should be near my_prob
[1] 0.233
Modify the rbernoulli()
function so it has 2 arguments: the number of realizations to draw and success probability
Compare the mean number of successes, first from 10 realizations with probability of success p = 0.2 and 10 realizations with probability of success p = 0.6; and then 1000 realizations with each of p = 0.2 and p = 0.6. What do you observe?
# Draw num_draws independent Bernoulli(success_prob) realizations.
# Returns a numeric vector of 0s and 1s of length num_draws.
rbernoulli_draws <- function(num_draws, success_prob){
  bernoulli_sample <- numeric(num_draws) # Preallocate instead of growing with c()
  for (i in seq_len(num_draws)){
    bernoulli_sample[i] <- rbernoulli(success_prob)
  }
  return(bernoulli_sample)
}
sample1_10 <- rbernoulli_draws(10, 0.2)
sample2_10 <- rbernoulli_draws(10, 0.6)
paste0("Mean for sample size of 10, p = 0.2: ", mean(sample1_10))
[1] "Mean for sample size of 10, p = 0.2: 0.1"
paste0("Mean for sample size of 10, p = 0.6: ", mean(sample2_10))
[1] "Mean for sample size of 10, p = 0.6: 0.7"
sample1_1000 <- rbernoulli_draws(1000, 0.2)
sample2_1000 <- rbernoulli_draws(1000, 0.6)
paste0("Mean for sample size of 1000, p = 0.2: ", mean(sample1_1000))
[1] "Mean for sample size of 1000, p = 0.2: 0.187"
paste0("Mean for sample size of 1000, p = 0.6: ", mean(sample2_1000))
[1] "Mean for sample size of 1000, p = 0.6: 0.634"
Write a function to simulate from a discrete random variable taking values 1, 2, 3 with probabilities p1, p2, p3, supplied by the user of the function. Note: you could do it with a sample()
function, but we don’t want you to use it for this exercise.
# Draw one realization of a discrete random variable taking values 1, 2, 3
# with probabilities p1, p2, p3 (inverse-CDF method on a Uniform(0, 1) draw).
# Stops with an error if the probabilities do not sum to 1.
rdiscr3 <- function(p1, p2, p3){
  if (abs(1 - p1 - p2 - p3) > 10^-7) { # 10^-7 used to approximate 0
    stop("probabilities must add to 1")
  }

  return_value <- 3 # Default: uniform draw falls in the last interval (p1 + p2, 1]

  my_unif <- runif(1)

  if (my_unif < p1) {
    return_value <- 1
  } else if (my_unif < p1 + p2) {
    return_value <- 2
  }

  return(return_value)
}
# Draw num_draws independent realizations from the 3-valued discrete
# distribution defined by probabilities p1, p2, p3 (see rdiscr3()).
rdiscr3_ndraws <- function(num_draws, p1, p2, p3){
  random_sample <- numeric(num_draws) # Preallocate instead of growing with c()
  for (i in seq_len(num_draws)){
    random_sample[i] <- rdiscr3(p1, p2, p3)
  }
  return(random_sample)
}
The code below should help you test your function. Name your function rdiscr3
num_sim <- 10000
test_vector <- rdiscr3_ndraws(num_draws = num_sim, 0.1, 0.2, 0.7)
# Calculate empirical proportions
sum(ifelse(test_vector == 1, yes = 1, no = 0)) / num_sim
[1] 0.099
sum(ifelse(test_vector == 2, yes = 1, no = 0)) / num_sim
[1] 0.2014
sum(ifelse(test_vector == 3, yes = 1, no = 0)) / num_sim
[1] 0.6996
For many common distributions, you can use base R
or common packages to simulate random samples. Generate 1000 samples and show the results using an appropriate plot from :
# Sample from common distributions with base R and plot the results.
gamma_draws <- rgamma(10000, 3, 4) # Gamma(shape = 3, rate = 4)
hist(gamma_draws, main = "Random Sample from Gamma(3,4)")

beta_draws <- rbeta(10000, 5, 6) # Beta(5, 6)
hist(beta_draws, main = "Random Sample from Beta(5,6)")

poisson_draws <- rpois(10000, 5) # discrete distribution, so use a barplot of counts
barplot(table(poisson_draws), main = "Random Sample from Poisson(5)")