Summarizing Data with Numbers

ISI-BUDS 2023

head(fastfood)
# A tibble: 6 × 17
  restaurant item       calories cal_fat total_fat sat_fat trans_fat cholesterol
  <fct>      <chr>         <dbl>   <dbl>     <dbl>   <dbl>     <dbl>       <dbl>
1 Mcdonalds  Artisan G…      380      60         7       2       0            95
2 Mcdonalds  Single Ba…      840     410        45      17       1.5         130
3 Mcdonalds  Double Ba…     1130     600        67      27       3           220
4 Mcdonalds  Grilled B…      750     280        31      10       0.5         155
5 Mcdonalds  Crispy Ba…      920     410        45      12       0.5         120
6 Mcdonalds  Big Mac         540     250        28      10       1            80
# ℹ 9 more variables: sodium <dbl>, total_carb <dbl>, fiber <dbl>, sugar <dbl>,
#   protein <dbl>, vit_a <dbl>, vit_c <dbl>, calcium <dbl>, salad <chr>

tail(fastfood)
# A tibble: 6 × 17
  restaurant item       calories cal_fat total_fat sat_fat trans_fat cholesterol
  <fct>      <chr>         <dbl>   <dbl>     <dbl>   <dbl>     <dbl>       <dbl>
1 Taco Bell  Original …      700     270        30       9       0.5          45
2 Taco Bell  Spicy Tri…      780     340        38      10       0.5          50
3 Taco Bell  Express T…      580     260        29       9       1            60
4 Taco Bell  Fiesta Ta…      780     380        42      10       1            60
5 Taco Bell  Fiesta Ta…      720     320        35       7       0            70
6 Taco Bell  Fiesta Ta…      720     320        36       8       1            55
# ℹ 9 more variables: sodium <dbl>, total_carb <dbl>, fiber <dbl>, sugar <dbl>,
#   protein <dbl>, vit_a <dbl>, vit_c <dbl>, calcium <dbl>, salad <chr>

Number of Observations

nrow(fastfood)
[1] 515

Number of Variables

ncol(fastfood)
[1] 17

glimpse(fastfood)
Rows: 515
Columns: 17
$ restaurant  <fct> Mcdonalds, Mcdonalds, Mcdonalds, Mcdonalds, Mcdonalds, Mcd…
$ item        <chr> "Artisan Grilled Chicken Sandwich", "Single Bacon Smokehou…
$ calories    <dbl> 380, 840, 1130, 750, 920, 540, 300, 510, 430, 770, 380, 62…
$ cal_fat     <dbl> 60, 410, 600, 280, 410, 250, 100, 210, 190, 400, 170, 300,…
$ total_fat   <dbl> 7, 45, 67, 31, 45, 28, 12, 24, 21, 45, 18, 34, 20, 34, 8, …
$ sat_fat     <dbl> 2.0, 17.0, 27.0, 10.0, 12.0, 10.0, 5.0, 4.0, 11.0, 21.0, 4…
$ trans_fat   <dbl> 0.0, 1.5, 3.0, 0.5, 0.5, 1.0, 0.5, 0.0, 1.0, 2.5, 0.0, 1.5…
$ cholesterol <dbl> 95, 130, 220, 155, 120, 80, 40, 65, 85, 175, 40, 95, 125, …
$ sodium      <dbl> 1110, 1580, 1920, 1940, 1980, 950, 680, 1040, 1040, 1290, …
$ total_carb  <dbl> 44, 62, 63, 62, 81, 46, 33, 49, 35, 42, 38, 48, 48, 67, 31…
$ fiber       <dbl> 3, 2, 3, 2, 4, 3, 2, 3, 2, 3, 2, 3, 3, 5, 2, 2, 3, 3, 5, 2…
$ sugar       <dbl> 11, 18, 18, 18, 18, 9, 7, 6, 7, 10, 5, 11, 11, 11, 6, 3, 1…
$ protein     <dbl> 37, 46, 70, 55, 46, 25, 15, 25, 25, 51, 15, 32, 42, 33, 13…
$ vit_a       <dbl> 4, 6, 10, 6, 6, 10, 10, 0, 20, 20, 2, 10, 10, 10, 2, 4, 6,…
$ vit_c       <dbl> 20, 20, 20, 25, 20, 2, 2, 4, 4, 6, 0, 10, 20, 15, 2, 6, 15…
$ calcium     <dbl> 20, 20, 50, 20, 20, 15, 10, 2, 15, 20, 15, 35, 35, 35, 4, …
$ salad       <chr> "Other", "Other", "Other", "Other", "Other", "Other", "Oth…

Review

mean: the average of a list of data values \[\bar x = \frac{\Sigma_{i = 1}^{n} x_i}{n}\]

median: the middle value when the data are ordered

sample variance: approximately the average squared distance from the mean (the sum of squared deviations is divided by n − 1 rather than n). It measures the variability of the data. \[s^2 = \frac{\Sigma_{i = 1}^{n} (x_i - \bar x )^2}{n-1}\]

sample standard deviation: roughly the typical distance of a data value from the mean. It is the square root of the variance.

\[s = \sqrt{s^{2}} = \sqrt{\frac{\Sigma_{i = 1}^{n} (x_i - \bar x )^2}{n-1}}\]

Mean

mean: the average of a list of data values \[\bar x = \frac{\Sigma_{i = 1}^{n} x_i}{n}\]

summarize(fastfood, 
          mean(calories))
# A tibble: 1 × 1
  `mean(calories)`
             <dbl>
1             531.
mean(fastfood$calories)
[1] 530.9126

Median

median: the middle value when the data are ordered

summarize(fastfood, 
          median(calories))
# A tibble: 1 × 1
  `median(calories)`
               <dbl>
1                490
median(fastfood$calories)
[1] 490

Variance

sample variance: approximately the average squared distance from the mean (the sum of squared deviations is divided by n − 1 rather than n). It measures the variability of the data. \[s^2 = \frac{\Sigma_{i = 1}^{n} (x_i - \bar x )^2}{n-1}\]

summarize(fastfood, 
          var(calories))
# A tibble: 1 × 1
  `var(calories)`
            <dbl>
1          79770.
var(fastfood$calories)
[1] 79770.18

Standard Deviation

sample standard deviation: roughly the typical distance of a data value from the mean. It is the square root of the variance. \[s = \sqrt{s^{2}} = \sqrt{\frac{\Sigma_{i = 1}^{n} (x_i - \bar x )^2}{n-1}}\]

summarize(fastfood, 
          sd(calories))
# A tibble: 1 × 1
  `sd(calories)`
           <dbl>
1           282.
sd(fastfood$calories)
[1] 282.4361

Minimum

summarize(fastfood, 
          min(calories))
# A tibble: 1 × 1
  `min(calories)`
            <dbl>
1              20
min(fastfood$calories)
[1] 20

Maximum

summarize(fastfood, 
          max(calories))
# A tibble: 1 × 1
  `max(calories)`
            <dbl>
1            2430
max(fastfood$calories)
[1] 2430

We can use multiple functions inside summarize().

summarize(fastfood, 
          mean(calories), 
          median(calories),
          var(calories),
          sd(calories),
          min(calories),
          max(calories))
# A tibble: 1 × 6
  `mean(calories)` `median(calories)` `var(calories)` `sd(calories)`
             <dbl>              <dbl>           <dbl>          <dbl>
1             531.                490          79770.           282.
# ℹ 2 more variables: `min(calories)` <dbl>, `max(calories)` <dbl>

We can create our own variable names for the output (e.g. mea_cal for the mean).

summarize(fastfood, 
          mea_cal = mean(calories), 
          med_cal = median(calories),
          var_cal = var(calories),
          sd_cal = sd(calories),
          min_cal = min(calories),
          max_cal = max(calories))
# A tibble: 1 × 6
  mea_cal med_cal var_cal sd_cal min_cal max_cal
    <dbl>   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>
1    531.     490  79770.   282.      20    2430

Three solutions to a single problem

What is the average of 4, 8, 16 approximately?

1. What is the average of **4, 8, 16** approximately?

2. What is the **average** of 4, 8, 16 approximately?

3. What is the average of 4, 8, 16 **approximately**?

Solution 1: Functions within Functions

c(4, 8, 16)
[1]  4  8 16

mean(c(4, 8, 16))
[1] 9.333333

round(mean(c(4, 8, 16)))
[1] 9

Problem with writing functions within functions

Things will get messy and more difficult to read and debug as we deal with more complex operations on data.

Solution 2: Creating Objects

numbers <- c(4, 8, 16)
numbers
[1]  4  8 16

avg_number <- mean(numbers)
avg_number
[1] 9.333333

round(avg_number)
[1] 9

Problem with creating many objects

We will end up with too many objects in the Environment.

Solution 3: The (forward) Pipe Operator %>%

Shortcut:
Ctrl (Command) + Shift + M

c(4, 8, 16) %>% 
  mean() %>% 
  round()
[1] 9

Combine 4, 8, and 16 and then
Take the mean and then
Round the output

The output of the first function is the first argument of the second function.

Do you recall composite functions such as \((f \circ g)(x) = f(g(x))\)?

Now we have \((f \circ g \circ h)(x) = f(g(h(x)))\), or round(mean(c(4, 8, 16))), where h is c(), g is mean(), and f is round().

h(x) %>% 
  g() %>% 
  f()
c(4, 8, 16) %>% 
  mean() %>% 
  round()

We can use the pipe operator with the summarize() function.

fastfood %>% 
  summarize(mea_cal = mean(calories), 
          med_cal = median(calories),
          var_cal = var(calories),
          sd_cal = sd(calories),
          min_cal = min(calories),
          max_cal = max(calories))
# A tibble: 1 × 6
  mea_cal med_cal var_cal sd_cal min_cal max_cal
    <dbl>   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>
1    531.     490  79770.   282.      20    2430

So far we have been summarizing the whole data set numerically. With grouped data, we often want to compute these summaries for each group separately (e.g. left- vs. right-handed people; first-years vs. sophomores vs. juniors vs. seniors; etc.).

group_by()

group_by() separates the data frame by the groups. Any action following group_by() will be completed for each group separately.

Notice that the output indicates Groups: restaurant [8]

fastfood %>% 
  group_by(restaurant)
# A tibble: 515 × 17
# Groups:   restaurant [8]
   restaurant item      calories cal_fat total_fat sat_fat trans_fat cholesterol
   <fct>      <chr>        <dbl>   <dbl>     <dbl>   <dbl>     <dbl>       <dbl>
 1 Mcdonalds  Artisan …      380      60         7       2       0            95
 2 Mcdonalds  Single B…      840     410        45      17       1.5         130
 3 Mcdonalds  Double B…     1130     600        67      27       3           220
 4 Mcdonalds  Grilled …      750     280        31      10       0.5         155
 5 Mcdonalds  Crispy B…      920     410        45      12       0.5         120
 6 Mcdonalds  Big Mac        540     250        28      10       1            80
 7 Mcdonalds  Cheesebu…      300     100        12       5       0.5          40
 8 Mcdonalds  Classic …      510     210        24       4       0            65
 9 Mcdonalds  Double C…      430     190        21      11       1            85
10 Mcdonalds  Double Q…      770     400        45      21       2.5         175
# ℹ 505 more rows
# ℹ 9 more variables: sodium <dbl>, total_carb <dbl>, fiber <dbl>, sugar <dbl>,
#   protein <dbl>, vit_a <dbl>, vit_c <dbl>, calcium <dbl>, salad <chr>

fastfood %>% 
  group_by(restaurant) %>% 
  summarize(med_calories = median(calories),
            mean_calories = mean(calories))
# A tibble: 8 × 3
  restaurant  med_calories mean_calories
  <fct>              <dbl>         <dbl>
1 Arbys                550          533.
2 Burger King          555          609.
3 Chick Fil-A          390          384.
4 Dairy Queen          485          520.
5 Mcdonalds            540          640.
6 Sonic                570          632.
7 Subway               460          503.
8 Taco Bell            420          444.