### libraries
library(tidymodels)
library(discrim)
library(ISLR)
library(ggplot2)
library(dplyr)

# Helper: misclassification rate (1 - accuracy) for a tibble of augmented
# predictions that holds the true `Direction` and the predicted `.pred_class`.
# Returns a single number.
error_rate <- function(results) {
  results |>
    accuracy(truth = Direction, estimate = .pred_class) |>
    mutate(error = 1 - .estimate) |>
    pull(error)
}

## Stock Market Data ----

# 1. Load and explore (through numerical and graphical summaries) the
#    `Smarket` data (this is in the `ISLR` package).
head(Smarket)

## Any relationship between Direction and Lag over time?
# Lag1..Lag5 are stacked into one column; the "Lag" prefix is stripped so the
# facet variable is the numeric lag. `Year` is numeric, so it is converted to
# a factor: with a continuous x, the boxes would be grouped by `Direction`
# only and every year would be pooled into a single box.
Smarket |>
  pivot_longer(
    Lag1:Lag5,
    names_to = "Lag",
    names_prefix = "Lag",
    names_transform = list(Lag = as.numeric),
    values_to = "value"
  ) |>
  ggplot() +
  geom_boxplot(aes(factor(Year), value, fill = Direction)) +
  facet_wrap(~Lag) +
  labs(x = "Year")
## Doesn't look like much....

## Logistic Regression ----

# 1. Fit a logistic regression model to predict `Direction` using `Lag1`
#    through `Lag5` and `Volume`. Describe your results.
# No `family` argument is given to fit(): extra arguments to parsnip's fit()
# are ignored (engine options belong in set_engine()), and logistic_reg()
# with its default engine already fits a binomial model.
logistic_spec <- logistic_reg()

m0.fit <- logistic_spec |>
  fit(Direction ~ ., data = Smarket |> select(-Year, -Today))

m0.fit |>
  pluck("fit") |>
  summary()

# 2. Create a confusion matrix for the training data.
m0.train_res <- m0.fit |>
  augment(new_data = Smarket)

m0.train_res |>
  conf_mat(truth = Direction, estimate = .pred_class)

# 3. What is the overall error rate of the model?
m0.train_res |>
  error_rate()
## This is barely better than flipping a coin.

# 4. Create two data sets, `train` and `test`, that correspond to the
#    observations from 2001 to 2004 (`train`) and 2005 (`test`).
train <- Smarket |>
  filter(Year <= 2004)
test <- Smarket |>
  filter(Year == 2005)

# 5. Repeat 1-3, but obtain the test confusion matrix and error rate.
m0.test_res <- logistic_spec |>
  fit(Direction ~ ., data = train |> select(-Year, -Today)) |>
  augment(new_data = test)

m0.test_res |>
  conf_mat(truth = Direction, estimate = .pred_class)

m0.test_res |>
  error_rate()
## Now our error rate is worse than coin flipping.

# 6. Repeat 5, but with a model of `Direction` on `Lag1` and `Lag2` only.
m1.test_res <- logistic_spec |>
  fit(Direction ~ Lag1 + Lag2, data = train) |>
  augment(new_data = test)

m1.test_res |>
  conf_mat(truth = Direction, estimate = .pred_class)

m1.test_res |>
  error_rate()
## Similar results.

## LDA ----

# 1. Fit a linear discriminant analysis model to the `train` data set you
#    created in the previous section with `Direction` as the response and
#    `Lag1` and `Lag2` as the predictors.
lda_spec <- discrim_linear()

m2.fit <- lda_spec |>
  fit(Direction ~ Lag1 + Lag2, data = train)

# 2. What are the values for $\hat{\pi}_1$ and $\hat{\pi}_2$?
m2.fit$fit$prior

# 3. Create a confusion matrix for the `test` data.
m2.test_res <- m2.fit |>
  augment(new_data = test)

m2.test_res |>
  conf_mat(truth = Direction, estimate = .pred_class)

# 4. What is the test error rate?
m2.test_res |>
  error_rate()

## QDA ----

# 1. Fit a quadratic discriminant analysis model to the `train` data set you
#    created in the previous section with `Direction` as the response and
#    `Lag1` and `Lag2` as the predictors.
qda_spec <- discrim_quad()

m3.fit <- qda_spec |>
  fit(Direction ~ Lag1 + Lag2, data = train)

# 2. Create a confusion matrix for the `test` data.
m3.test_res <- m3.fit |>
  augment(new_data = test)

m3.test_res |>
  conf_mat(truth = Direction, estimate = .pred_class)

# 3. What is the test error rate?
m3.test_res |>
  error_rate()

## KNN ----

# 1. Fit a KNN model with $K = 1$ to the `train` data set you created in the
#    previous section with `Direction` as the response and `Lag1` and `Lag2`
#    as the predictors.
knn1_spec <- nearest_neighbor(mode = "classification", neighbors = 1)

m4.fit <- knn1_spec |>
  fit(Direction ~ Lag1 + Lag2, data = train)

# 2. Create a confusion matrix for the `test` data.
m4.test_res <- m4.fit |>
  augment(new_data = test)

m4.test_res |>
  conf_mat(truth = Direction, estimate = .pred_class)

# 3. What is the test error rate?
m4.test_res |>
  error_rate()

# 4. Repeat 1.-3. with $K = 3$ and $K = 5$.
knn3_spec <- nearest_neighbor(mode = "classification", neighbors = 3)

m5.fit <- knn3_spec |>
  fit(Direction ~ Lag1 + Lag2, data = train)

m5.test_res <- m5.fit |>
  augment(new_data = test)

m5.test_res |>
  conf_mat(truth = Direction, estimate = .pred_class)

m5.test_res |>
  error_rate()

knn5_spec <- nearest_neighbor(mode = "classification", neighbors = 5)

m6.fit <- knn5_spec |>
  fit(Direction ~ Lag1 + Lag2, data = train)

m6.test_res <- m6.fit |>
  augment(new_data = test)

m6.test_res |>
  conf_mat(truth = Direction, estimate = .pred_class)

m6.test_res |>
  error_rate()

# Of all the models you fit today, which would you pick to predict values of
# `Direction` and why?
## QDA appears to give the "best" test error rate, indicating a good mix of
## flexibility.