Fitting Exercise

Load the necessary packages

library(dplyr)
Warning: package 'dplyr' was built under R version 4.5.2

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggplot2)
Warning: package 'ggplot2' was built under R version 4.5.2
library(tidyverse)
Warning: package 'tibble' was built under R version 4.5.2
Warning: package 'tidyr' was built under R version 4.5.2
Warning: package 'readr' was built under R version 4.5.2
Warning: package 'purrr' was built under R version 4.5.2
Warning: package 'lubridate' was built under R version 4.5.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ lubridate 1.9.5     ✔ tibble    3.3.1
✔ purrr     1.2.1     ✔ tidyr     1.3.2
✔ readr     2.2.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
✔ broom        1.0.12     ✔ rsample      1.3.2 
✔ dials        1.4.2      ✔ tailor       0.1.0 
✔ infer        1.1.0      ✔ tune         2.0.1 
✔ modeldata    1.5.1      ✔ workflows    1.3.0 
✔ parsnip      1.4.1      ✔ workflowsets 1.1.1 
✔ recipes      1.3.1      ✔ yardstick    1.3.2 
Warning: package 'broom' was built under R version 4.5.2
Warning: package 'infer' was built under R version 4.5.2
Warning: package 'parsnip' was built under R version 4.5.2
Warning: package 'rsample' was built under R version 4.5.2
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
library(broom.mixed)
Warning: package 'broom.mixed' was built under R version 4.5.2
library(dotwhisker)
library(Metrics)

Attaching package: 'Metrics'

The following objects are masked from 'package:yardstick':

    accuracy, mae, mape, mase, precision, recall, rmse, smape

Data Processing and Exploration

##Read in data (Mavoglurant A2121 NMPK dataset; 2678 rows x 17 numeric columns)
mavo <- read_csv("Mavoglurant_A2121_nmpk.csv")
Rows: 2678 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (17): ID, CMT, EVID, EVI2, MDV, DV, LNDV, AMT, TIME, DOSE, OCC, RATE, AG...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
##Plot DV over TIME, one trajectory per subject, faceted by DOSE
ggplot(data = mavo, mapping = aes(x = TIME, y = DV, group = ID)) +
    geom_line() +
    geom_point() +
    facet_wrap(vars(DOSE), scales = "fixed")

##Keep only the first dosing occasion (drops OCC == 2 records)
mavo <- filter(mavo, OCC == 1)

##Total post-baseline DV per subject (TIME == 0 rows excluded)
mavo_exclude <- mavo %>%
    filter(TIME != 0) %>%
    group_by(ID) %>%
    summarize(Y = sum(DV, na.rm = TRUE)) %>%
    ungroup()

##Baseline rows only (TIME == 0); these carry the per-subject covariates
mavo_include <- filter(mavo, TIME == 0)

##Combine the per-subject totals with the baseline covariates by subject ID
mavo_merged <- mavo_exclude %>% full_join(mavo_include, by = "ID")

##Recode RACE and SEX as factors; keep outcome Y plus the covariates of interest
mavo_final <- mavo_merged %>%
    mutate(across(c(RACE, SEX), factor)) %>%
    select(Y, DOSE, AGE, SEX, RACE, WT, HT)

More exploring

##Just some basic scatterplots and box plots of Y against each covariate
ggplot(mavo_final, aes(AGE, Y)) +
    geom_point()    # no clear trend of Y with age

ggplot(mavo_final, aes(SEX, Y)) +
    geom_boxplot()  # distributions look similar across the two sexes

ggplot(mavo_final, aes(RACE, Y)) +
    geom_boxplot()  # distributions look similar across race categories

ggplot(mavo_final, aes(WT, Y)) +
    geom_point()    # no clear trend of Y with weight

ggplot(mavo_final, aes(HT, Y)) +
    geom_point()    # no clear trend of Y with height

ggplot(mavo_final, aes(DOSE, Y, group = DOSE)) +
    geom_boxplot()  # Y rises with dose, roughly proportionally — as expected

Model Fitting

##Linear model: Y as outcome, DOSE as the single predictor
lm_yd <- linear_reg()
lm_yd_fit <- fit(lm_yd, Y ~ DOSE, data = mavo_final)
tidy(lm_yd_fit)
# A tibble: 2 × 5
  term        estimate std.error statistic  p.value
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)    323.     199.        1.62 1.07e- 1
2 DOSE            58.2      5.19     11.2  2.69e-20
tidy(lm_yd_fit) %>% dwplot()
Warning: Using the `size` aesthetic with geom_segment was deprecated in ggplot2 3.4.0.
ℹ Please use the `linewidth` aesthetic instead.
ℹ The deprecated feature was likely used in the dotwhisker package.
  Please report the issue at <https://github.com/fsolt/dotwhisker/issues>.

##Point predictions and confidence intervals on the fitting data
mean_pred_yd <- predict(lm_yd_fit, new_data = mavo_final)
conf_int_pred_yd <- predict(lm_yd_fit, new_data = mavo_final, type = "conf_int")
predict_yd <- bind_cols(mavo_final, mean_pred_yd, conf_int_pred_yd)
##Predicted Y with confidence bars at each dose level
ggplot(predict_yd, aes(x = DOSE)) +
    geom_point(aes(y = .pred)) +
    geom_errorbar(aes(ymin = .pred_lower, ymax = .pred_upper))

##RMSE and R-squared for the dose-only model
metrics_yd <- predict_yd %>%
    metrics(truth = Y, estimate = .pred) %>%
    filter(.metric %in% c("rmse", "rsq")) #this part I had to get help from chatgpt to figure out
print(metrics_yd)
# A tibble: 2 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard     666.   
2 rsq     standard       0.516

The RMSE for this model is 666.46, meaning its predictions of Y are typically about 666.46 units away from the true measured Y. The R-squared is 0.516, meaning the model explains only 51.6% of the variation in Y. Typically, I would say that this means it's not a great model, but I don't even know anymore.

##Linear model: Y as outcome, all available covariates as predictors
lm_y_all <- linear_reg()
lm_y_all_fit <- fit(lm_y_all, Y ~ DOSE + AGE + SEX + RACE + WT + HT, data = mavo_final)
tidy(lm_y_all_fit)
# A tibble: 9 × 5
  term        estimate std.error statistic  p.value
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)  3387.     1835.       1.85  6.76e- 2
2 DOSE           59.9       4.88    12.3   2.05e-22
3 AGE             3.16      7.82     0.403 6.88e- 1
4 SEX2         -358.      217.      -1.65  1.02e- 1
5 RACE2         155.      129.       1.21  2.31e- 1
6 RACE7        -405.      448.      -0.904 3.68e- 1
7 RACE88        -53.5     245.      -0.219 8.27e- 1
8 WT            -23.0       6.40    -3.60  4.71e- 4
9 HT           -748.     1104.      -0.678 4.99e- 1
tidy(lm_y_all_fit) %>% dwplot()

##Predictions, intervals, and performance metrics for the full linear model
mean_pred_y_all <- predict(lm_y_all_fit, new_data = mavo_final)
conf_int_pred_y_all <- predict(lm_y_all_fit, new_data = mavo_final, type = "conf_int")
predict_y_all <- bind_cols(mavo_final, mean_pred_y_all, conf_int_pred_y_all)
metrics_y_all <- predict_y_all %>%
    metrics(truth = Y, estimate = .pred) %>%
    filter(.metric %in% c("rmse", "rsq"))
print(metrics_y_all)
# A tibble: 2 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard     591.   
2 rsq     standard       0.619

The RMSE for this model is 590.85, meaning its predictions of Y are typically about 590.85 units away from the true measured Y. The R-squared is 0.619, meaning the model explains only 61.9% of the variation in Y. Typically, I would say that this means it's not a great model, but it does explain more variation and make closer predictions than the previous model, which makes sense because this one accounts for the effects of all possible predictors.

##Logistic model: SEX as outcome, DOSE as the single predictor
log_sex_dose <- logistic_reg() %>% set_engine("glm")
log_sex_dose_fit <- fit(log_sex_dose, SEX ~ DOSE, data = mavo_final)
tidy(log_sex_dose_fit)
# A tibble: 2 × 5
  term        estimate std.error statistic p.value
  <chr>          <dbl>     <dbl>     <dbl>   <dbl>
1 (Intercept)  -0.765     0.854     -0.896   0.370
2 DOSE         -0.0318    0.0243    -1.31    0.192
tidy(log_sex_dose_fit) %>% dwplot()

##Class and probability predictions, then accuracy and ROC AUC
class_pred_sex_dose <- predict(log_sex_dose_fit, new_data = mavo_final, type = "class")
prob_pred_sex_dose <- predict(log_sex_dose_fit, new_data = mavo_final, type = "prob")
predict_sex_dose <- bind_cols(mavo_final, class_pred_sex_dose, prob_pred_sex_dose)
##Score against the second factor level of SEX (the "event");
##yardstick:: prefixes are needed because the Metrics package masks these names
event_class <- levels(mavo_final$SEX)[2]
event_prob_col <- rlang::sym(paste0(".pred_", event_class))
metrics_sex_dose <- yardstick::metric_set(yardstick::accuracy, yardstick::roc_auc)(
    predict_sex_dose,
    truth = SEX,
    estimate = .pred_class,
    !!event_prob_col,
    event_level = "second"
) #this part I had to get help from chatgpt to figure out
print(metrics_sex_dose)
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.867
2 roc_auc  binary         0.592

The model’s accuracy is 0.867, meaning 86.7% of observations were classified correctly. The ROC AUC is 0.592, indicating a pretty weak ability to distinguish between the two SEX categories, which is interesting.

##Logistic model: SEX as outcome, all other variables as predictors
log_sex_all <- logistic_reg() %>% set_engine("glm")
##Fit with log_sex_all (the spec defined just above) — the original code
##fit with log_sex_dose, leaving log_sex_all defined but unused; it only
##worked because the two specs happened to be identical
log_sex_all_fit <- log_sex_all %>% fit(SEX ~ DOSE + AGE + Y + RACE + WT + HT, data = mavo_final)
tidy(log_sex_all_fit)
# A tibble: 9 × 5
  term         estimate std.error statistic  p.value
  <chr>           <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)  60.3     18.0         3.34   0.000824
2 DOSE         -0.0308   0.0776     -0.396  0.692   
3 AGE           0.0834   0.0607      1.37   0.170   
4 Y            -0.00104  0.000963   -1.08   0.280   
5 RACE2        -1.93     1.37       -1.40   0.161   
6 RACE7         0.118    3.85        0.0306 0.976   
7 RACE88       -1.50     2.19       -0.683  0.494   
8 WT           -0.0628   0.0794     -0.791  0.429   
9 HT          -33.2     11.1        -3.00   0.00274 
tidy(log_sex_all_fit) %>% dwplot()

##Class and probability predictions, then accuracy and ROC AUC for the full model
class_pred_sex_all <- predict(log_sex_all_fit, new_data = mavo_final, type = "class")
prob_pred_sex_all <- predict(log_sex_all_fit, new_data = mavo_final, type = "prob")
predict_sex_all <- bind_cols(mavo_final, class_pred_sex_all, prob_pred_sex_all)
##Score against the second factor level of SEX (the "event");
##yardstick:: prefixes are needed because the Metrics package masks these names
event_class <- levels(mavo_final$SEX)[2]
event_prob_col <- rlang::sym(paste0(".pred_", event_class))
metrics_sex_all <- yardstick::metric_set(yardstick::accuracy, yardstick::roc_auc)(
    predict_sex_all,
    truth = SEX,
    estimate = .pred_class,
    !!event_prob_col,
    event_level = "second"
)
print(metrics_sex_all)
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.942
2 roc_auc  binary         0.980

The model’s accuracy is 0.942, meaning 94.2% of observations were classified correctly. The ROC AUC is 0.980, indicating a good ability to distinguish between the two SEX categories. This is significantly better than the previous model, which would normally make sense since we are accounting for all the predictors; however, with SEX as the outcome, I'm not really sure what this means.