# R code for Batting Order Optimization Project - Gavin Rublewski (UConn '23)

# TABLE OF CONTENTS
# Section 1 - Instructions and Inputs
# Section 2 - Monte Carlo Simulation Code
# Section 3 - Saving Results
# Section 4 - Hypothesis Tests
# Section 5 - Examples of How to Use Hypothesis Tests

#-------------------------------------------------------------------------------
#----------------Section 1 - Instructions and Inputs----------------------------

# Work through Section 1 very carefully - otherwise you will not get the
# desired results.
# Before running this code, make sure the raw data is saved in a folder.
# Use the prior year's data file as a template for how the file must be set up.
# Running the code one line at a time is recommended:
#   - put the cursor on a line and hit CTRL + ENTER, or
#   - highlight the code you want to run and hit the "Run" button.

# Optional: clear the workspace. This removes every object in your environment
# so that you can start over and re-run everything from scratch.
rm(list = ls())

# Complete the steps below so that the rest of the code runs smoothly.
# Once they are done, you should be able to run Section 2 with no issues.

# SET WORKING DIRECTORY - MODIFY THIS PATH SO THAT IT POINTS TO YOUR FOLDER
setwd("C:/Users/ghrub/OneDrive/Documents/UConn Baseball")

# INSTALL THE NECESSARY PACKAGE IF NOT YET INSTALLED.
# REMOVE THE # SYMBOL FROM THE NEXT LINE, THEN RUN IT.
#install.packages("combinat")

# Load the combinat package (provides permn(), used in Section 2)
library(combinat)

# INSTALL THE NECESSARY PACKAGE IF NOT YET INSTALLED.
# REMOVE THE # SYMBOL FROM THE NEXT LINE, THEN RUN IT.
#install.packages("dplyr")

# Load the dplyr package (select(), distinct(), arrange(), ... are used later)
library(dplyr)

# Read the raw data file into R
# MODIFY THE NAME OF THE FILE TO MATCH THE ONE IN YOUR FOLDER
UConnData <- read.csv(file = "real_data_2_3_winterincluded.csv")

# Preview the data frame - first 6 rows
head(UConnData)

# The data frame carries a lot of columns because they were needed to calculate
# the probabilities in the Excel file. List them:
colnames(UConnData)

# Only a few columns are needed, so build a new, condensed data frame.
# THE DESIRED PROBABILITIES MAY BE LABELED DIFFERENTLY IN YOUR FILE, SO MODIFY
# THE NAMES OF THE SELECTED COLUMNS AS NECESSARY.
# UConnData is the data frame; everything after it is a column name - only
# adjust the column names.
RequiredData <- select(
  UConnData,
  Player.Name,
  Adjusted.Probability.1B,
  Adjusted.Probability.2B,
  Adjusted.Probability.3B,
  Adjusted.Probability.Home,
  Adjusted.Probability.Out
)

# STATE WHO CAN PLAY EACH POSITION - TRY TO LIMIT THESE OPTIONS SO THAT THE
# CODE DOESN'T TAKE TOO LONG TO RUN
Catcher <- c("Matt Garbowski", "Ryan Hyde")
FirstBase <- c("Ben Huber", "Maddix Dalena")
SecondBase <- c("Ryan Daniels", "David Smith")
ThirdBase <- c("Dominic Freeberger", "Luke Broadhurst")
Shortstop <- "Bryan Padilla"
LeftField <- c("Korey Morton", "Niko Brini", "Jake Studley")
CenterField <- c("T.C. Simmons", "Korey Morton", "Niko Brini")
RightField <- c("Jake Studley", "Korey Morton", "Niko Brini")
DesignatedHitter <- c(
  "Luke Broadhurst", "Ryan Daniels", "Maddix Dalena", "Matt Garbowski"
)

# LIST THE PLAYERS WHO WILL DEFINITELY BE IN THE STARTING LINEUP NO MATTER WHAT
Definite_Starters <- c(
  "Ben Huber", "David Smith", "Bryan Padilla",
  "Dominic Freeberger", "T.C. Simmons", "Korey Morton"
)

# FILL IN THE NAMES OF WHO CAN BAT AT EACH SPOT IN THE ORDER
# DO NOT LET EVERYONE BAT EVERYWHERE - THAT IS TOO MANY OPTIONS TO TEST
first_names <- c("David Smith", "Bryan Padilla", "T.C. Simmons")
second_names <- c(
  "David Smith", "Bryan Padilla", "T.C. Simmons",
  "Korey Morton", "Dominic Freeberger"
)
third_names <- c("Ben Huber", "Dominic Freeberger", "Korey Morton")
fourth_names <- c("Ben Huber", "Dominic Freeberger", "Korey Morton")
fifth_names <- c("Korey Morton", "T.C. Simmons", "Luke Broadhurst")
sixth_names <- c(
  "Matt Garbowski", "Bryan Padilla", "Luke Broadhurst",
  "Maddix Dalena", "Ryan Daniels"
)
seventh_names <- c(
  "Matt Garbowski", "Ryan Hyde", "Bryan Padilla", "Niko Brini",
  "Jake Studley", "Maddix Dalena", "Ryan Daniels", "Luke Broadhurst"
)
eigth_names <- c(
  "Matt Garbowski", "Ryan Hyde", "Bryan Padilla", "Niko Brini",
  "Jake Studley", "Maddix Dalena", "Ryan Daniels"
)
ninth_names <- c(
  "Matt Garbowski", "Ryan Hyde", "Bryan Padilla", "Niko Brini",
  "Jake Studley", "Maddix Dalena", "Ryan Daniels"
)

# INPUT PROBABILITY ASSUMPTIONS TO BE USED IN MODEL
probrunneron2drivenhomebysingle <- 0.60
probrunneron1drivenhomebydouble <- 0.45
probrunneron2reaches3ondouble <- 0.10

# INPUT NUMBER OF SIMULATED GAMES PER LINEUP
Number_Games_Simulated <- 100

# All inputs are now set, so you are free to run Section 2.
# Once you have the results, go to Section 3 and save the file - make sure to
# give it an appropriate name.
# Section 4 is for analyzing the results using hypothesis tests.
# If you find Section 4 confusing, Section 5 shows past analysis using
# hypothesis tests that you may use as a reference

#-------------------------------------------------------------------------------
#------------------Section 2 - Monte Carlo Simulation Code----------------------

# The line below shows how to sample from a probability distribution.
# Syntax: sample(vector of categories, number to sample, probability of each
# category)
sample(0:4, size = 1, prob = c(0.4, 0.2, 0.1, 0.1, 0.2))
# Each time you run the above line of code, you get a new random answer.
# This will be used extensively later on to simulate at-bat outcomes.

# Create a skeleton data frame with 81 rows (9 trips through a 9-player batting
# order). This skeleton will later be populated with simulated games.
SkeletonData <- data.frame(matrix(nrow = 81, ncol = 13))

# Name the columns in SkeletonData
colnames(SkeletonData) <- c(
  "Batter", "Prob1st", "Prob2nd", "Prob3rd", "ProbHome", "ProbOut",
  "Result", "Inning", "Outs",
  "Base Runner 1", "Base Runner 2", "Base Runner 3", "Runs"
)

# Next, we must obtain a list of all possible batting orders:
#   1. find all the reasonable COMBINATIONS of 9-person lineups (unordered),
#   2. for each 9-person combination, build a matrix of all PERMUTATIONS,
#   3. stack those matrices on top of each other with rbind.
# Note - each player's assigned number is the row in which that player appears
# in RequiredData.

# The position vectors (Catcher, FirstBase, SecondBase, ThirdBase, Shortstop,
# LeftField, CenterField, RightField, DesignatedHitter) are inputs that are set
# in Section 1.

# Match up each player name with a numerical code to make the coding easier.
# The numbering follows the rows of RequiredData, whatever the roster size is.
NumberLookupTable <- data.frame(
  Number_for_Player = seq_len(nrow(RequiredData)),
  Player_Name = RequiredData$Player.Name
)

# Save this table to your computer as both an RDS file and an Excel file -
# delete the # symbol and run to do so.
# If you want to see this in R, just click on the RDS file in the Files tab on
# the right of the R screen.
#saveRDS(NumberLookupTable, "NumberLookupTable.rds")
#writexl::write_xlsx(NumberLookupTable, "NumberLookupTable.xlsx")

# Helper: convert a vector of player names to their numbers in the lookup table.
#   player_names - character vector of names typed in Section 1
#   input_label  - name of the Section 1 input, used only in the warning text
#   lookup       - lookup table whose second column holds the player names
# Returns the integer vector produced by match(); a name that is not in the
# table comes back as NA, and a warning lists every such name so that typos are
# noticed before the long-running simulation starts.
lookup_player_numbers <- function(player_names, input_label,
                                  lookup = NumberLookupTable) {
  numbers <- match(player_names, lookup[, 2])
  unknown_names <- player_names[is.na(numbers)]
  if (length(unknown_names) > 0) {
    warning(
      "Player name(s) in ", input_label, " not found in the data file: ",
      paste(unknown_names, collapse = ", "),
      call. = FALSE
    )
  }
  numbers
}

# Convert each position vector from names to player numbers
C_Numbers <- lookup_player_numbers(Catcher, "Catcher")
First_Numbers <- lookup_player_numbers(FirstBase, "FirstBase")
Second_Numbers <- lookup_player_numbers(SecondBase, "SecondBase")
Third_Numbers <- lookup_player_numbers(ThirdBase, "ThirdBase")
Short_Numbers <- lookup_player_numbers(Shortstop, "Shortstop")
LF_Numbers <- lookup_player_numbers(LeftField, "LeftField")
CF_Numbers <- lookup_player_numbers(CenterField, "CenterField")
RF_Numbers <- lookup_player_numbers(RightField, "RightField")
DH_Numbers <- lookup_player_numbers(DesignatedHitter, "DesignatedHitter")

# Find all possible 9-person lineup combinations given the restrictions.
# expand.grid() pairs every option at one position with every option at the
# others, so many rows list the same player at two positions.
combination_grid <- expand.grid(
  C_Numbers, First_Numbers, Second_Numbers, Short_Numbers, Third_Numbers,
  LF_Numbers, RF_Numbers, CF_Numbers, DH_Numbers
)

# Work on a matrix so each row can be tested as a plain vector
combination_grid <- as.matrix(combination_grid)

# A valid lineup lists 9 different players ...
has_nine_unique_players <- apply(
  combination_grid, 1,
  function(lineup) length(unique(lineup)) == 9
)

# ... and contains every definite starter (Definite_Starters is set in
# Section 1). Convert the text names to their numerical codes first.
Definite_Starters_Numbers <- lookup_player_numbers(
  Definite_Starters, "Definite_Starters"
)
has_all_definite_starters <- apply(
  combination_grid, 1,
  function(lineup) all(Definite_Starters_Numbers %in% lineup)
)

# Keep only the rows that pass both tests
combination_grid <- as.data.frame(
  combination_grid[
    has_nine_unique_players & has_all_definite_starters, ,
    drop = FALSE
  ]
)

if (nrow(combination_grid) == 0) {
  stop(
    "No 9-player combination satisfies the position options and definite ",
    "starters set in Section 1.",
    call. = FALSE
  )
}

# Set column names for combination_grid
colnames(combination_grid) <- c(
  "C", "1B", "2B", "SS", "3B", "LF", "RF", "CF", "DH"
)

# Make a second version of the grid that shows player names instead of numbers
combination_grid_Text <- combination_grid
combination_grid_Text[] <- lapply(
  combination_grid,
  function(player_numbers) NumberLookupTable[player_numbers, 2]
)

# We now have all combinations - there are not too many of them (28 in 2023).
# The next step is to find all reasonable permutations.
# How many options is that? For each combination, there are 9! = 362,880.
# This is way too big, so we will reduce it - but first we build every
# possibility and then quickly narrow things down from there.

# Construct all permutations of every lineup combination (28 in 2023).
# This code may take a minute to run.
# permn() is called once per combination, and the per-combination matrices are
# stacked in a single rbind, in the same order as combination_grid.
permutation_blocks <- lapply(
  seq_len(nrow(combination_grid)),
  function(i) {
    orders <- permn(as.numeric(combination_grid[i, ]))
    matrix(unlist(orders), nrow = length(orders), byrow = TRUE)
  }
)
permutations_expanded <- do.call(rbind, permutation_blocks)

# Turn permutations_expanded into a data frame and set its column names
permutations_expanded <- as.data.frame(permutations_expanded)
colnames(permutations_expanded) <- c(
  "first", "second", "third", "fourth", "fifth",
  "sixth", "seventh", "eigth", "ninth"
)

# Some rows are repeated because some 9-person combinations were the same
# 9 people, just in different positions. Get rid of the repeated rows.
permutations_expanded <- distinct(permutations_expanded)

# Now permutations_expanded has fewer options (5,080,320 in 2023) - but we can
# narrow that down using the batting-slot restrictions (first_names through
# ninth_names), which are inputs set in Section 1.

# Convert the batting-slot names to numbers
firstoptions <- lookup_player_numbers(first_names, "first_names")
secondoptions <- lookup_player_numbers(second_names, "second_names")
thirdoptions <- lookup_player_numbers(third_names, "third_names")
fourthoptions <- lookup_player_numbers(fourth_names, "fourth_names")
fifthoptions <- lookup_player_numbers(fifth_names, "fifth_names")
sixthoptions <- lookup_player_numbers(sixth_names, "sixth_names")
seventhoptions <- lookup_player_numbers(seventh_names, "seventh_names")
eigthoptions <- lookup_player_numbers(eigth_names, "eigth_names")
ninthoptions <- lookup_player_numbers(ninth_names, "ninth_names")

# Keep only the batting orders in which every slot is filled by an allowed
# player
fits_slot_restrictions <-
  permutations_expanded$first %in% firstoptions &
  permutations_expanded$second %in% secondoptions &
  permutations_expanded$third %in% thirdoptions &
  permutations_expanded$fourth %in% fourthoptions &
  permutations_expanded$fifth %in% fifthoptions &
  permutations_expanded$sixth %in% sixthoptions &
  permutations_expanded$seventh %in% seventhoptions &
  permutations_expanded$eigth %in% eigthoptions &
  permutations_expanded$ninth %in% ninthoptions
permutations_condensed <- permutations_expanded[fits_slot_restrictions, ]

# Now we have 1,440 (in 2023) possible lineups - this is manageable

# The probability assumptions (probrunneron2drivenhomebysingle,
# probrunneron1drivenhomebydouble, probrunneron2reaches3ondouble) and the number
# of simulated games per lineup (Number_Games_Simulated) are inputs that are set
# in Section 1.

# The following code is used to run the game simulation and gather our results.
# The code will take a very long time to run (around 10 hours given the current
# level of complexity).
# To see how the code works, feel free to change the options for k to 1:2 and
# see the results, then change it back to the original setting.
# Run the helper function below first. Then run the start_time code, the loop,
# the end_time code, and the end_time - start_time code together at once - this
# will run the simulation and tell us how long it took.
# The code essentially does the following:
# For each possible batting order, simulate 100 games (or however many you would
# like), then find the run total for each game, and obtain the average runs and
# standard deviation of runs.

# Helper: move the baserunners after a hit.
#   result      - at-bat outcome: 1 = single, 2 = double, 3 = triple,
#                 4 = home run
#   r1, r2, r3  - base (0 = nobody, 1, 2 or 3) occupied before the at-bat by the
#                 lead runner, the second runner and the trailing runner
#   p_single_scores_2nd - probability a runner on 2nd scores on a single
#   p_double_scores_1st - probability a runner on 1st scores on a double
#   p_double_holds_2nd  - probability a runner on 2nd only reaches 3rd on a
#                         double
# Returns list(bases = c(new r1, new r2, new r3), runs = runs scored on the
# play), or NULL when the base state is not one the model knows about.
# The probability defaults are the Section 1 inputs.
advance_runners <- function(result, r1, r2, r3,
                            p_single_scores_2nd =
                              probrunneron2drivenhomebysingle,
                            p_double_scores_1st =
                              probrunneron1drivenhomebydouble,
                            p_double_holds_2nd =
                              probrunneron2reaches3ondouble) {
  outcome <- function(bases, runs) {
    list(bases = bases, runs = runs)
  }

  if (result == 1) {
    # Sampled value 3 = the uncertain runner holds at 3rd; otherwise he scores
    hold_or_score <- c(1 - p_single_scores_2nd, p_single_scores_2nd)
    if (r1 == 0) {
      return(outcome(c(1, 0, 0), 0))
    }
    if (r1 == 1) {
      return(outcome(c(2, 1, 0), 0))
    }
    if (r1 == 2 && r2 == 0) {
      lead <- sample(c(3, 1), size = 1, prob = hold_or_score)
      if (lead == 3) {
        return(outcome(c(3, 1, 0), 0))
      }
      return(outcome(c(1, 0, 0), 1))
    }
    if (r1 == 2 && r2 == 1) {
      lead <- sample(c(3, 2), size = 1, prob = hold_or_score)
      if (lead == 3) {
        return(outcome(c(3, 2, 1), 0))
      }
      return(outcome(c(2, 1, 0), 1))
    }
    if (r1 == 3 && r2 == 0) {
      return(outcome(c(1, 0, 0), 1))
    }
    if (r1 == 3 && r2 == 1) {
      return(outcome(c(2, 1, 0), 1))
    }
    if (r1 == 3 && r2 == 2 && r3 == 0) {
      # Runner on 3rd scores; runner on 2nd either holds at 3rd or scores
      lead <- sample(c(3, 1), size = 1, prob = hold_or_score)
      if (lead == 3) {
        return(outcome(c(3, 1, 0), 1))
      }
      return(outcome(c(1, 0, 0), 2))
    }
    if (r3 == 1) {
      # Bases loaded: runner on 3rd scores; runner on 2nd holds or scores
      lead <- sample(c(3, 2), size = 1, prob = hold_or_score)
      if (lead == 3) {
        return(outcome(c(3, 2, 1), 1))
      }
      return(outcome(c(2, 1, 0), 2))
    }
    return(NULL)
  }

  if (result == 2) {
    # Used with sample(c(3, 2), ...): 3 = runner from 1st stops at 3rd
    first_holds_or_scores <- c(1 - p_double_scores_1st, p_double_scores_1st)
    # Used with sample(c(2, 3), ...): 3 = runner from 2nd only reaches 3rd
    second_scores_or_holds <- c(1 - p_double_holds_2nd, p_double_holds_2nd)
    # After a double the batter is on 2nd. If the uncertain runner stopped at
    # 3rd (lead == 3) he is the new lead runner; otherwise he scored as well.
    after_double <- function(lead, runs_if_held) {
      if (lead == 3) {
        outcome(c(3, 2, 0), runs_if_held)
      } else {
        outcome(c(2, 0, 0), runs_if_held + 1)
      }
    }
    if (r1 == 0) {
      return(outcome(c(2, 0, 0), 0))
    }
    if (r1 == 1) {
      lead <- sample(c(3, 2), size = 1, prob = first_holds_or_scores)
      return(after_double(lead, 0))
    }
    if (r1 == 2 && r2 == 0) {
      lead <- sample(c(2, 3), size = 1, prob = second_scores_or_holds)
      return(after_double(lead, 0))
    }
    if (r1 == 2 && r2 == 1) {
      # Runner on 2nd scores; runner on 1st holds at 3rd or scores
      lead <- sample(c(3, 2), size = 1, prob = first_holds_or_scores)
      return(after_double(lead, 1))
    }
    if (r1 == 3 && r2 == 0) {
      return(outcome(c(2, 0, 0), 1))
    }
    if (r1 == 3 && r2 == 1) {
      lead <- sample(c(3, 2), size = 1, prob = first_holds_or_scores)
      return(after_double(lead, 1))
    }
    if (r1 == 3 && r2 == 2 && r3 == 0) {
      lead <- sample(c(2, 3), size = 1, prob = second_scores_or_holds)
      return(after_double(lead, 1))
    }
    if (r3 == 1) {
      # Bases loaded: runners on 3rd and 2nd score; runner on 1st holds or
      # scores
      lead <- sample(c(3, 2), size = 1, prob = first_holds_or_scores)
      return(after_double(lead, 2))
    }
    return(NULL)
  }

  # Triples and home runs clear the bases, so only the number of runners matters
  if (r1 == 0) {
    runners_on <- 0
  } else if (r2 == 0) {
    runners_on <- 1
  } else if (r3 == 0) {
    runners_on <- 2
  } else if (r3 == 1) {
    runners_on <- 3
  } else {
    return(NULL)
  }

  if (result == 3) {
    # Every runner scores and the batter stands on 3rd
    return(outcome(c(3, 0, 0), runners_on))
  }
  if (result == 4) {
    # Every runner and the batter score
    return(outcome(c(0, 0, 0), runners_on + 1))
  }
  NULL
}

if (nrow(permutations_condensed) == 0) {
  stop(
    "No batting orders satisfy the restrictions set in Section 1.",
    call. = FALSE
  )
}

start_time <- Sys.time()

for (k in seq_len(nrow(permutations_condensed))) {
  # Player numbers of the lineup being analyzed, in batting order
  lineup_numbers <- as.numeric(permutations_condensed[k, 1:9])

  # Names and 1B, 2B, 3B, Home and Out probabilities for each player in the
  # lineup
  BatterVector <- as.vector(RequiredData[lineup_numbers, 1])
  Prob1Vector <- as.vector(RequiredData[lineup_numbers, 2])
  Prob2Vector <- as.vector(RequiredData[lineup_numbers, 3])
  Prob3Vector <- as.vector(RequiredData[lineup_numbers, 4])
  Prob4Vector <- as.vector(RequiredData[lineup_numbers, 5])
  ProbOutVector <- as.vector(RequiredData[lineup_numbers, 6])

  # Fill out the skeleton data frame - repeat the lineup 9 times, creating 81
  # at-bats
  SkeletonData[, 1] <- rep(BatterVector, 9)
  SkeletonData[, 2] <- rep(Prob1Vector, 9)
  SkeletonData[, 3] <- rep(Prob2Vector, 9)
  SkeletonData[, 4] <- rep(Prob3Vector, 9)
  SkeletonData[, 5] <- rep(Prob4Vector, 9)
  SkeletonData[, 6] <- rep(ProbOutVector, 9)

  # Outcome probabilities per at-bat, ordered to match the outcome codes 0:4
  # (out, single, double, triple, home run). They are the same for every game
  # of this lineup, so they are extracted once here.
  at_bat_probs <- as.matrix(
    SkeletonData[, c("ProbOut", "Prob1st", "Prob2nd", "Prob3rd", "ProbHome")]
  )

  # Next, loop through many games and obtain the average runs and standard
  # deviation of runs. The seed is reset for every lineup.
  set.seed(1)

  # RunTotalTracker tracks the run totals for each individual game
  RunTotalTracker <- as.data.frame(
    matrix(nrow = Number_Games_Simulated, ncol = 1)
  )
  colnames(RunTotalTracker) <- "Runs"

  for (j in seq_len(Number_Games_Simulated)) {
    # Start each game from a fresh copy of the skeleton
    SkeletonDataTmpVersion <- SkeletonData

    # Simulate a result for every at-bat in the data frame.
    # Results are a number 0-4, representing out, reached first, reached
    # second, reached third, and reached home, respectively.
    SkeletonDataTmpVersion$Result <- vapply(
      seq_len(nrow(SkeletonDataTmpVersion)),
      function(i) sample(0:4, size = 1, prob = at_bat_probs[i, ]),
      integer(1)
    )

    # Complete the first row of the data frame - the first at-bat in the first
    # inning. Columns 9:13 are Outs, Base Runner 1-3 and Runs.
    #   out:      1 out, nobody on base, no runs
    #   hit 1-3:  0 outs, the batter is the lead runner on that base, no runs
    #   home run: 0 outs, nobody on base, 1 run
    first_result <- SkeletonDataTmpVersion$Result[1]
    SkeletonDataTmpVersion$Inning[1] <- 1
    if (first_result == 0) {
      SkeletonDataTmpVersion[1, 9:13] <- c(1, 0, 0, 0, 0)
    } else if (first_result %in% 1:3) {
      SkeletonDataTmpVersion[1, 9:13] <- c(0, first_result, 0, 0, 0)
    } else {
      SkeletonDataTmpVersion[1, 9:13] <- c(0, 0, 0, 0, 1)
    }

    # Fill in the remainder of the game based on the simulated at-bat results,
    # adjusting innings, outs, baserunners and runs as necessary
    for (i in seq_len(nrow(SkeletonDataTmpVersion))[-1]) {
      result <- SkeletonDataTmpVersion$Result[i]
      prev_inning <- SkeletonDataTmpVersion$Inning[i - 1]
      prev_outs <- SkeletonDataTmpVersion$Outs[i - 1]
      prev_runs <- SkeletonDataTmpVersion$Runs[i - 1]
      # The previous at-bat was the third out, so this at-bat opens a new inning
      inning_over <- prev_outs == 3

      if (prev_inning == 9 && inning_over) {
        # The game ended with the third out of the 9th inning: keep runs
        # constant for the remainder of the data frame. The run total is later
        # read from the last row, which then holds the final score.
        SkeletonDataTmpVersion$Inning[i] <- 9
        SkeletonDataTmpVersion$Outs[i] <- 3
        SkeletonDataTmpVersion$Runs[i] <- prev_runs
      } else if (result == 0) {
        if (!inning_over) {
          # One more out; runners and runs carry over unchanged
          SkeletonDataTmpVersion$Outs[i] <- 1 + prev_outs
          SkeletonDataTmpVersion[i, 10:13] <-
            SkeletonDataTmpVersion[i - 1, 10:13]
          SkeletonDataTmpVersion$Inning[i] <- prev_inning
        } else {
          # First out of a new inning; the bases start empty
          SkeletonDataTmpVersion$Outs[i] <- 1
          SkeletonDataTmpVersion[i, 10:12] <- 0
          SkeletonDataTmpVersion$Runs[i] <- prev_runs
          SkeletonDataTmpVersion$Inning[i] <- 1 + prev_inning
        }
      } else if (result %in% 1:4) {
        # A hit never adds an out; it only rolls the inning over when the
        # previous at-bat was the third out
        if (inning_over) {
          SkeletonDataTmpVersion$Inning[i] <- 1 + prev_inning
          SkeletonDataTmpVersion$Outs[i] <- 0
          r1 <- 0
          r2 <- 0
          r3 <- 0
        } else {
          SkeletonDataTmpVersion$Inning[i] <- prev_inning
          SkeletonDataTmpVersion$Outs[i] <- prev_outs
          r1 <- SkeletonDataTmpVersion$`Base Runner 1`[i - 1]
          r2 <- SkeletonDataTmpVersion$`Base Runner 2`[i - 1]
          r3 <- SkeletonDataTmpVersion$`Base Runner 3`[i - 1]
        }

        play <- advance_runners(result, r1, r2, r3)
        if (is.null(play)) {
          SkeletonDataTmpVersion[i, 10:13] <- "Error"
        } else {
          SkeletonDataTmpVersion[i, 10:12] <- play$bases
          SkeletonDataTmpVersion$Runs[i] <- prev_runs + play$runs
        }
      } else {
        SkeletonDataTmpVersion[i, 10:13] <- "Error"
      }
    }

    # For this particular game, place the run total (the Runs value in the last
    # row) into the run total tracker
    RunTotalTracker[j, 1] <-
      SkeletonDataTmpVersion$Runs[nrow(SkeletonDataTmpVersion)]
  }

  # Calculate the mean and standard deviation of run totals for all games
  OrderOptionMean <- mean(RunTotalTracker$Runs)
  OrderOptionSD <- sd(RunTotalTracker$Runs)

  # Place the answer in the results data frame (permutations_condensed) - each
  # lineup will have an associated mean and standard deviation of runs per game
  permutations_condensed[k, 10] <- OrderOptionMean
  permutations_condensed[k, 11] <- OrderOptionSD
  # look at permutations_condensed for the results
}

end_time <- Sys.time()
end_time - start_time

# Let's put this into words so we can actually read what's going on.
# The following code converts the player numbers in the first 9 columns to
# player names, so that we can easily analyze the results.
Text_Results <- permutations_condensed
Text_Results[1:9] <- lapply(
  permutations_condensed[1:9],
  function(player_numbers) NumberLookupTable[player_numbers, 2]
)

# Give the last two columns appropriate names
colnames(Text_Results)[10:11] <- c("AvgRuns", "SDRuns")

# Organize the results in descending order of average runs
Text_Results <- arrange(Text_Results, desc(AvgRuns))

#-------------------------------------------------------------------------------
#-----------------------Section 3 - Saving Results------------------------------

# Save the results in an RDS file (can look at in R) and in an Excel file
saveRDS(Text_Results, "EnterNameHere.rds")

# Install and reference the package only if the below code isn't working
#install.packages("writexl")
#library("writexl")
writexl::write_xlsx(Text_Results, "EnterNameHere.xlsx")
#-------------------------------------------------------------------------------
#-----------------Section 4 - Hypothesis Tests---------------------------------

# GeneralLineupComparisonTest runs a one-sided, pooled-variance two-sample t test:
#   null hypothesis H0: mu1 - mu2 = 0   vs.   alternative hypothesis H1: mu1 - mu2 > 0
# Assumptions: samples of lineups are selected independently, and the variances
# of both sets of samples are equal
#   mu1 = mean of AvgRuns for Strategy 1
#   mu2 = mean of AvgRuns for Strategy 2
# We are trying to test whether a given strategy (Strategy 1) is truly superior
# to another strategy (Strategy 2)
#
# Inputs:
#   vector1 - the vector of AvgRuns values associated with Strategy 1
#   vector2 - the vector of AvgRuns values associated with Strategy 2
#   alpha   - the significance level of the test: the highest acceptable
#             probability of a Type I error. A Type I error means we reject H0
#             and conclude Strategy 1 is better when in fact H0 is true
#             (Strategy 1 is not better). In other words, alpha is the threshold
#             used to decide whether a result is statistically significant
#   n1      - the number of lineups associated with Strategy 1
#             (defaults to length(vector1), so it may be omitted)
#   n2      - the number of lineups associated with Strategy 2
#             (defaults to length(vector2), so it may be omitted)
#
# The function calculates a test statistic, which under H0 follows a Student-t
# distribution with n1 + n2 - 2 degrees of freedom.
# The p-value is the probability that a Student-t random variable is at least as
# large as the test statistic, i.e. the probability of results at least as
# extreme as what we observed given H0 is true. If the p-value is small, it is
# likely that H1 is the correct hypothesis.
#
# Returns a one-row data frame with columns:
#   pvalue - the one-sided p-value
#   Result - "Chosen Strategy is Truly Better" when pvalue <= alpha, otherwise
#            "Chosen Strategy is Not Significantly Better"
# If none of that made sense, no worries - just read the Result column.
GeneralLineupComparisonTest <- function(vector1, vector2, alpha,
                                        n1 = length(vector1),
                                        n2 = length(vector2)) {
  degfreedom <- n1 + n2 - 2

  # Pooled estimate of the common variance, weighted by each sample's df
  pooled_sample_variance <-
    ((n1 - 1) * var(vector1) + (n2 - 1) * var(vector2)) / degfreedom

  standard_error <- sqrt(pooled_sample_variance * (1 / n1 + 1 / n2))
  teststatistic <- (mean(vector1) - mean(vector2) - 0) / standard_error

  # Ask pt() for the upper tail directly. Computing 1 - pt(...) rounds to
  # exactly 0 once the statistic is large, which hides how small the p-value is.
  pval <- pt(q = teststatistic, df = degfreedom, lower.tail = FALSE)

  # ifelse() (rather than if/else) so an NA p-value, e.g. from an empty
  # strategy group, yields an NA Result instead of an error
  testresult <- ifelse(pval <= alpha,
                       "Chosen Strategy is Truly Better",
                       "Chosen Strategy is Not Significantly Better")

  data.frame(pvalue = pval, Result = testresult)
}

# We can also test the difference between two specific lineups, but I wouldn't
# recommend doing this.
# The standard deviation of runs for any lineup is very high, so we are unlikely
# to pick up on a statistically significant difference between any two lineups.
# I would instead recommend the above function, which allows us to test the
# statistical significance of general strategies.

# But if you would really like to test between 2 individual lineups, here it is:
#   null hypothesis H0: mu1 - mu2 = 0   vs.   alternative hypothesis H1: mu1 - mu2 > 0
#   (H1 is what we're trying to prove)
#   mu1   = avg runs for supposed "better" batting order
#   mu2   = avg runs for supposed "worse" batting order
#   sd1   = standard deviation of runs for supposed "better" batting order
#   sd2   = standard deviation of runs for supposed "worse" batting order
#   alpha = significance level of the hypothesis test
#   n     = number of game simulations run per batting order
# Assuming the distribution of runs is approximately normal and the sample of
# batting outcomes is chosen randomly and independently, we can run a
# separate-variance (Welch) t test.

# Let's write the function that will calculate the result of our test.
# One-sided Welch (separate-variance) t test between two individual lineups.
#   mu1, mu2 - average runs of the supposed "better" / "worse" batting order
#   sd1, sd2 - standard deviation of runs of the "better" / "worse" batting order
#   alpha    - significance level of the test
#   n        - number of game simulations run per batting order (same for both)
# Returns a one-row data frame with columns:
#   pvalue - the one-sided p-value
#   Result - "Evidence of Improvement" when pvalue <= alpha, otherwise
#            "No Evidence of Improvement"
LineupComparisonTest <- function(mu1, mu2, sd1, sd2, alpha, n) {
  # Variance of each lineup's sample mean
  var_of_mean1 <- (sd1^2) / n
  var_of_mean2 <- (sd2^2) / n

  teststatistic <- ((mu1 - mu2) - 0) / sqrt(var_of_mean1 + var_of_mean2)

  # Welch-Satterthwaite degrees of freedom. The second denominator term must be
  # built from sd2; using sd1 in both terms gives the wrong df whenever
  # sd1 != sd2.
  degfreedom <- (var_of_mean1 + var_of_mean2)^2 /
    ((var_of_mean1^2 / (n - 1)) + (var_of_mean2^2 / (n - 1)))

  # Upper-tail probability taken directly from pt(); 1 - pt(...) rounds to 0
  # for large statistics
  pval <- pt(q = teststatistic, df = degfreedom, lower.tail = FALSE)

  testresult <- ifelse(pval <= alpha,
                       "Evidence of Improvement",
                       "No Evidence of Improvement")

  data.frame(pvalue = pval, Result = testresult)
}

# Use the words provided in the result as a guide. In general, if the p-value is
# less than or equal to alpha, we reject the null hypothesis in favor of the
# alternative hypothesis.

#-------------------------------------------------------------------------------
#--------------------Section 5 - Examples of How to Use Hypothesis Tests---------

# The following was my analysis for general strategies. You will need to design
# code to your own liking depending on what you choose to investigate.
# The below info is a bit messy at times but is a good example for how to use
# hypothesis tests to draw conclusions.

# Observations
# Smith appears to be the favorite for leadoff, with Simmons showing up as well
# Smith, Morton, Simmons, and Freeberger all appear in top lineups at the 2 spot
# Third is a Freeberger, Huber, Morton mix
# Huber and Freeberger (not Morton) appear to dominate the 4 spot
# Morton and Simmons are the top options at 5
# Studley appears in some good lineups but he is no longer a must-start
# Daniels and Brini are given much more favor in this iteration - we may need to start them
# Garbowski still appears to be a must (Hyde shows up alongside him a few times)
# Hard to say if Broadhurst or Dalena are still important to have in the lineup - doesn't look like it
# The complications enable contact hitters to earn extra bases, giving favor to the contact hitters

# Smith vs Simmons at leadoff?
Smith1 <- Text_Results %>% filter(first == "David Smith")
Simmons1 <- Text_Results %>% filter(first == "T.C. Simmons")
# if Simmons is 1, Smith has to be 2
Smith1Simmons2 <- Text_Results %>%
  filter(first == "David Smith" & second == "T.C. Simmons")
Smith1Simmons5 <- Text_Results %>%
  filter(first == "David Smith" & fifth == "T.C. Simmons")

# Smith 1 and Simmons 2 or 5 vs. Simmons 1 Smith 2
GeneralLineupComparisonTest(
  vector1 = Smith1$AvgRuns, vector2 = Simmons1$AvgRuns,
  alpha = 0.10, n1 = nrow(Smith1), n2 = nrow(Simmons1)
)
# pval = 0.79 - inconclusive

# Smith 1 Simmons 2 vs. Simmons 1 Smith 2
GeneralLineupComparisonTest(
  vector1 = Smith1Simmons2$AvgRuns, vector2 = Simmons1$AvgRuns,
  alpha = 0.10, n1 = nrow(Smith1Simmons2), n2 = nrow(Simmons1)
)
# pval = 0.258 - inconclusive

# Smith 1 Simmons 5 vs. Simmons 1 Smith 2
GeneralLineupComparisonTest(
  vector1 = Smith1Simmons5$AvgRuns, vector2 = Simmons1$AvgRuns,
  alpha = 0.10, n1 = nrow(Smith1Simmons5), n2 = nrow(Simmons1)
)
# pval = 0.94 - Simmons 1 Smith 2 beats Smith 1 Simmons 5, but not Smith 1 Simmons 2

# Smith 1 Simmons 5 vs Smith 1 Simmons 2
GeneralLineupComparisonTest(
  vector1 = Smith1Simmons2$AvgRuns, vector2 = Smith1Simmons5$AvgRuns,
  alpha = 0.10, n1 = nrow(Smith1Simmons2), n2 = nrow(Smith1Simmons5)
)
# pval = 0.012 - Smith 1 Simmons 2 wins

# Conclusions: Smith 1 Simmons 5 is not the best strategy. If Smith leads off,
# it is better to put Simmons 2.
mean(Smith1Simmons2$AvgRuns)
mean(Simmons1$AvgRuns)
mean(Smith1Simmons5$AvgRuns)

# Does Simmons 5 beat the alternatives?
Simmons5 <- Text_Results %>% filter(fifth == "T.C. Simmons")
NotSimmons5 <- Text_Results %>% filter(fifth != "T.C. Simmons")
GeneralLineupComparisonTest(
  vector1 = Simmons5$AvgRuns, vector2 = NotSimmons5$AvgRuns,
  alpha = 0.10, n1 = nrow(Simmons5), n2 = nrow(NotSimmons5)
)
# Winner - Morton 5 - confirms prior guess - Simmons should be 1 or 2
mean(Simmons5$AvgRuns)
mean(NotSimmons5$AvgRuns)

# Where should Morton bat: 3rd, 4th or 5th?
Morton5 <- Text_Results %>% filter(fifth == "Korey Morton")
Morton4 <- Text_Results %>% filter(fourth == "Korey Morton")
Morton3 <- Text_Results %>% filter(third == "Korey Morton")
Morton34 <- Text_Results %>%
  filter(third == "Korey Morton" | fourth == "Korey Morton")
GeneralLineupComparisonTest(
  vector1 = Morton5$AvgRuns, vector2 = Morton34$AvgRuns,
  alpha = 0.10, n1 = nrow(Morton5), n2 = nrow(Morton34)
)
GeneralLineupComparisonTest(
  vector1 = Morton5$AvgRuns, vector2 = Morton3$AvgRuns,
  alpha = 0.10, n1 = nrow(Morton5), n2 = nrow(Morton3)
)
GeneralLineupComparisonTest(
  vector1 = Morton5$AvgRuns, vector2 = Morton4$AvgRuns,
  alpha = 0.10, n1 = nrow(Morton5), n2 = nrow(Morton4)
)
mean(Morton5$AvgRuns)
mean(Morton3$AvgRuns)
mean(Morton4$AvgRuns)
mean(Morton34$AvgRuns)
# Morton should bat either 3rd or 5th

# Compare specific heart-of-the-order arrangements
# (naming: F = Freeberger, H = Huber, M = Morton, digit = batting spot)
F3H4M5 <- Text_Results %>%
  filter(third == "Dominic Freeberger" & fourth == "Ben Huber" &
    fifth == "Korey Morton")
H3F4M5 <- Text_Results %>%
  filter(fourth == "Dominic Freeberger" & third == "Ben Huber" &
    fifth == "Korey Morton")
M3H4F2 <- Text_Results %>%
  filter(third == "Korey Morton" & fourth == "Ben Huber" &
    second == "Dominic Freeberger")
M4H3F2 <- Text_Results %>%
  filter(fourth == "Korey Morton" & third == "Ben Huber" &
    second == "Dominic Freeberger")
mean(F3H4M5$AvgRuns)
mean(H3F4M5$AvgRuns)
mean(M3H4F2$AvgRuns)
mean(M4H3F2$AvgRuns)
GeneralLineupComparisonTest(
  vector1 = F3H4M5$AvgRuns, vector2 = H3F4M5$AvgRuns,
  alpha = 0.10, n1 = nrow(F3H4M5), n2 = nrow(H3F4M5)
)
# pval = 0.0004
GeneralLineupComparisonTest(
  vector1 = F3H4M5$AvgRuns, vector2 = M3H4F2$AvgRuns,
  alpha = 0.10, n1 = nrow(F3H4M5), n2 = nrow(M3H4F2)
)
# pval = 0.016

# Does (Smith or Simmons) beat (Morton or Freeberger) in the 2 spot?
SmithorSimmons2 <- Text_Results %>%
  filter(second == "T.C. Simmons" | second == "David Smith")
MortonorFreeberger2 <- Text_Results %>%
  filter(second == "Korey Morton" | second == "Dominic Freeberger")
GeneralLineupComparisonTest(
  vector1 = SmithorSimmons2$AvgRuns, vector2 = MortonorFreeberger2$AvgRuns,
  alpha = 0.10, n1 = nrow(SmithorSimmons2), n2 = nrow(MortonorFreeberger2)
)
# pval = 0.0099 - Simmons or Smith at 2 wins
# Conclusion - 1-2 should be Smith/Simmons or Simmons/Smith, Morton should be 5

# Freeberger or Huber 3rd?
Huber3Freeberger4 <- Text_Results %>%
  filter(third == "Ben Huber" & fourth == "Dominic Freeberger")
Freeberger3Huber4 <- Text_Results %>%
  filter(third == "Dominic Freeberger" & fourth == "Ben Huber")
GeneralLineupComparisonTest(
  vector1 = Freeberger3Huber4$AvgRuns, vector2 = Huber3Freeberger4$AvgRuns,
  alpha = 0.10, n1 = nrow(Freeberger3Huber4), n2 = nrow(Huber3Freeberger4)
)
# pval = 0 - winner: Freeberger at 3 and Huber at 4
mean(Huber3Freeberger4$AvgRuns)
mean(Freeberger3Huber4$AvgRuns)
# Conclusions: the lineup should be: Smith/Simmons, Smith/Simmons, Freeberger, Huber, Morton

### Next - figure out why - look at raw data for answers
### Also - test the following

# Studley vs. Brini - does Studley still win? - will require more complicated code
# (keep every lineup in which the player appears at any of the nine spots)
StudleyIn <- Text_Results %>%
  filter(first == "Jake Studley" | second == "Jake Studley" |
    third == "Jake Studley" | fourth == "Jake Studley" |
    fifth == "Jake Studley" | sixth == "Jake Studley" |
    seventh == "Jake Studley" | eigth == "Jake Studley" |
    ninth == "Jake Studley")
BriniIn <- Text_Results %>%
  filter(first == "Niko Brini" | second == "Niko Brini" |
    third == "Niko Brini" | fourth == "Niko Brini" |
    fifth == "Niko Brini" | sixth == "Niko Brini" |
    seventh == "Niko Brini" | eigth == "Niko Brini" |
    ninth == "Niko Brini")
GeneralLineupComparisonTest(
  vector1 = StudleyIn$AvgRuns, vector2 = BriniIn$AvgRuns,
  alpha = 0.10, n1 = nrow(StudleyIn), n2 = nrow(BriniIn)
)
# pval = 0.99 - So Brini beats Studley now
mean(BriniIn$AvgRuns)
mean(StudleyIn$AvgRuns)

# Dalena vs. Daniels vs. Broadhurst - does Daniels still lose?
# Monday morning - gather all results and consider if you want to run anything else
DalenaIn <- Text_Results %>%
  filter(first == "Maddix Dalena" | second == "Maddix Dalena" |
    third == "Maddix Dalena" | fourth == "Maddix Dalena" |
    fifth == "Maddix Dalena" | sixth == "Maddix Dalena" |
    seventh == "Maddix Dalena" | eigth == "Maddix Dalena" |
    ninth == "Maddix Dalena")
DanielsIn <- Text_Results %>%
  filter(first == "Ryan Daniels" | second == "Ryan Daniels" |
    third == "Ryan Daniels" | fourth == "Ryan Daniels" |
    fifth == "Ryan Daniels" | sixth == "Ryan Daniels" |
    seventh == "Ryan Daniels" | eigth == "Ryan Daniels" |
    ninth == "Ryan Daniels")
BroadhurstIn <- Text_Results %>%
  filter(first == "Luke Broadhurst" | second == "Luke Broadhurst" |
    third == "Luke Broadhurst" | fourth == "Luke Broadhurst" |
    fifth == "Luke Broadhurst" | sixth == "Luke Broadhurst" |
    seventh == "Luke Broadhurst" | eigth == "Luke Broadhurst" |
    ninth == "Luke Broadhurst")
GeneralLineupComparisonTest(
  vector1 = BroadhurstIn$AvgRuns, vector2 = DanielsIn$AvgRuns,
  alpha = 0.10, n1 = nrow(BroadhurstIn), n2 = nrow(DanielsIn)
)
GeneralLineupComparisonTest(
  vector1 = BroadhurstIn$AvgRuns, vector2 = DalenaIn$AvgRuns,
  alpha = 0.10, n1 = nrow(BroadhurstIn), n2 = nrow(DalenaIn)
)
GeneralLineupComparisonTest(
  vector1 = DalenaIn$AvgRuns, vector2 = DanielsIn$AvgRuns,
  alpha = 0.10, n1 = nrow(DalenaIn), n2 = nrow(DanielsIn)
)
# p-values of zero suggest Broadhurst > Dalena > Daniels

# We know the starting lineup, so let's now see if we can determine the best back-end order
# Keep lineups with Broadhurst at 6-7, Garbowski at 6-9 and Brini at 7-9
BestStarters <- Text_Results %>%
  filter((sixth == "Luke Broadhurst" | seventh == "Luke Broadhurst") &
    (sixth == "Matt Garbowski" | seventh == "Matt Garbowski" |
      eigth == "Matt Garbowski" | ninth == "Matt Garbowski") &
    (seventh == "Niko Brini" | eigth == "Niko Brini" | ninth == "Niko Brini"))

# check Broadhurst 6 vs. Broadhurst 7
Broadhurst6 <- BestStarters %>% filter(sixth == "Luke Broadhurst")
Broadhurst7 <- BestStarters %>% filter(seventh == "Luke Broadhurst")
GeneralLineupComparisonTest(
  vector1 = Broadhurst6$AvgRuns, vector2 = Broadhurst7$AvgRuns,
  alpha = 0.10, n1 = nrow(Broadhurst6), n2 = nrow(Broadhurst7)
)
# very weak preference towards Broadhurst at 6

# look for best Padilla placement
Padilla6 <- BestStarters %>% filter(sixth == "Bryan Padilla")
Padilla7 <- BestStarters %>% filter(seventh == "Bryan Padilla")
Padilla8 <- BestStarters %>% filter(eigth == "Bryan Padilla")
Padilla9 <- BestStarters %>% filter(ninth == "Bryan Padilla")
mean(Padilla6$AvgRuns)
mean(Padilla7$AvgRuns)
mean(Padilla8$AvgRuns)
mean(Padilla9$AvgRuns)
NotPadilla7 <- BestStarters %>% filter(seventh != "Bryan Padilla")
GeneralLineupComparisonTest(
  vector1 = Padilla7$AvgRuns, vector2 = NotPadilla7$AvgRuns,
  alpha = 0.10, n1 = nrow(Padilla7), n2 = nrow(NotPadilla7)
)
# pval = 0.0585 - Padilla 7 wins
GeneralLineupComparisonTest(
  vector1 = Padilla7$AvgRuns, vector2 = Padilla9$AvgRuns,
  alpha = 0.10, n1 = nrow(Padilla7), n2 = nrow(Padilla9)
)
# pval = 0.118
GeneralLineupComparisonTest(
  vector1 = Padilla7$AvgRuns, vector2 = Padilla6$AvgRuns,
  alpha = 0.10, n1 = nrow(Padilla7), n2 = nrow(Padilla6)
)
# pval = 0.095
# Results weakly suggest Padilla at 7

# Look for best Garbowski placement
Garbowski6 <- BestStarters %>% filter(sixth == "Matt Garbowski")
Garbowski7 <- BestStarters %>% filter(seventh == "Matt Garbowski")
Garbowski8 <- BestStarters %>% filter(eigth == "Matt Garbowski")
Garbowski9 <- BestStarters %>% filter(ninth == "Matt Garbowski")
mean(Garbowski6$AvgRuns)
mean(Garbowski7$AvgRuns)
mean(Garbowski8$AvgRuns)
mean(Garbowski9$AvgRuns)
NotGarbowski9 <- BestStarters %>% filter(ninth != "Matt Garbowski")
GeneralLineupComparisonTest(
  vector1 = Garbowski9$AvgRuns, vector2 = NotGarbowski9$AvgRuns,
  alpha = 0.10, n1 = nrow(Garbowski9), n2 = nrow(NotGarbowski9)
)
# pval = 0.034 - Garbowski 9 wins
GeneralLineupComparisonTest(
  vector1 = Garbowski9$AvgRuns, vector2 = Garbowski8$AvgRuns,
  alpha = 0.10, n1 = nrow(Garbowski9), n2 = nrow(Garbowski8)
)
# pval = 0.033
GeneralLineupComparisonTest(
  vector1 = Garbowski9$AvgRuns, vector2 = Garbowski7$AvgRuns,
  alpha = 0.10, n1 = nrow(Garbowski9), n2 = nrow(Garbowski7)
)
# pval = 0.15 (likely due to a lack of data points)
# Results weakly suggest Garbowski at 9

# Look for best Brini placement
Brini7 <- BestStarters %>% filter(seventh == "Niko Brini")
Brini8 <- BestStarters %>% filter(eigth == "Niko Brini")
Brini9 <- BestStarters %>% filter(ninth == "Niko Brini")
mean(Brini7$AvgRuns)
mean(Brini8$AvgRuns)
mean(Brini9$AvgRuns)
NotBrini8 <- BestStarters %>% filter(eigth != "Niko Brini")
GeneralLineupComparisonTest(
  vector1 = Brini8$AvgRuns, vector2 = NotBrini8$AvgRuns,
  alpha = 0.10, n1 = nrow(Brini8), n2 = nrow(NotBrini8)
)
# pval = 0.030 - Brini 8 wins
GeneralLineupComparisonTest(
  vector1 = Brini8$AvgRuns, vector2 = Brini7$AvgRuns,
  alpha = 0.10, n1 = nrow(Brini8), n2 = nrow(Brini7)
)
# pval = 0.134
# Brini 8 vs. Brini 9. This call previously repeated the Garbowski 9 vs.
# Garbowski 7 test from the section above, so it said nothing about Brini.
GeneralLineupComparisonTest(
  vector1 = Brini8$AvgRuns, vector2 = Brini9$AvgRuns,
  alpha = 0.10, n1 = nrow(Brini8), n2 = nrow(Brini9)
)
# Results weakly suggest Brini at 8

# check on previous conclusions - Result: all the same

# Smith vs Simmons at leadoff?
Smith1b <- BestStarters %>% filter(first == "David Smith")
Simmons1b <- BestStarters %>% filter(first == "T.C. Simmons")
# if Simmons is 1, Smith has to be 2
Smith1Simmons2b <- BestStarters %>%
  filter(first == "David Smith" & second == "T.C. Simmons")
Smith1Simmons5b <- BestStarters %>%
  filter(first == "David Smith" & fifth == "T.C. Simmons")

# Smith 1 and Simmons 2 or 5 vs. Simmons 1 Smith 2
GeneralLineupComparisonTest(
  vector1 = Smith1b$AvgRuns, vector2 = Simmons1b$AvgRuns,
  alpha = 0.10, n1 = nrow(Smith1b), n2 = nrow(Simmons1b)
)
# pval = 0.91 - weak preference towards Simmons at 1 - but may be because of dislike for Simmons at 5

# Smith 1 Simmons 2 vs. Simmons 1 Smith 2
GeneralLineupComparisonTest(
  vector1 = Smith1Simmons2b$AvgRuns, vector2 = Simmons1b$AvgRuns,
  alpha = 0.10, n1 = nrow(Smith1Simmons2b), n2 = nrow(Simmons1b)
)
# pval = 0.661 - inconclusive

# Smith 1 Simmons 5 vs. Simmons 1 Smith 2
GeneralLineupComparisonTest(
  vector1 = Smith1Simmons5b$AvgRuns, vector2 = Simmons1b$AvgRuns,
  alpha = 0.10, n1 = nrow(Smith1Simmons5b), n2 = nrow(Simmons1b)
)
# pval = 0.948 - Simmons 1 Smith 2 beats Smith 1 Simmons 5

# Smith 1 Simmons 5 vs Smith 1 Simmons 2
GeneralLineupComparisonTest(
  vector1 = Smith1Simmons2b$AvgRuns, vector2 = Smith1Simmons5b$AvgRuns,
  alpha = 0.10, n1 = nrow(Smith1Simmons2b), n2 = nrow(Smith1Simmons5b)
)
# pval = 0.10 - Smith 1 Simmons 2 wins by a thin margin

# Conclusions: Smith 1 Simmons 5 is not the best strategy. If Smith leads off,
# it is better to put Simmons 2.
mean(Smith1Simmons2b$AvgRuns)
mean(Simmons1b$AvgRuns)
mean(Smith1Simmons5b$AvgRuns)

# Does Simmons 5 beat the alternatives?
Simmons5b <- BestStarters %>% filter(fifth == "T.C. Simmons")
NotSimmons5b <- BestStarters %>% filter(fifth != "T.C. Simmons")
GeneralLineupComparisonTest(
  vector1 = Simmons5b$AvgRuns, vector2 = NotSimmons5b$AvgRuns,
  alpha = 0.10, n1 = nrow(Simmons5b), n2 = nrow(NotSimmons5b)
)
# pval = 0.966 - Winner - Morton 5 - confirms prior guess - Simmons should be 1 or 2
mean(Simmons5b$AvgRuns)
mean(NotSimmons5b$AvgRuns)

# Does (Smith or Simmons) beat (Morton or Freeberger)
SmithorSimmons2b <- BestStarters %>%
  filter(second == "T.C. Simmons" | second == "David Smith")
MortonorFreeberger2b <- BestStarters %>%
  filter(second == "Korey Morton" | second == "Dominic Freeberger")
GeneralLineupComparisonTest(
  vector1 = SmithorSimmons2b$AvgRuns, vector2 = MortonorFreeberger2b$AvgRuns,
  alpha = 0.10, n1 = nrow(SmithorSimmons2b), n2 = nrow(MortonorFreeberger2b)
)
# pval = 0.033 - Simmons or Smith at 1/2 win
# Conclusion - 1-2 should be Smith/Simmons or Simmons/Smith, Morton should be 5

# Freeberger or Huber 3rd?
Huber3Freeberger4b <- BestStarters %>%
  filter(third == "Ben Huber" & fourth == "Dominic Freeberger")
Freeberger3Huber4b <- BestStarters %>%
  filter(third == "Dominic Freeberger" & fourth == "Ben Huber")
GeneralLineupComparisonTest(
  vector1 = Freeberger3Huber4b$AvgRuns, vector2 = Huber3Freeberger4b$AvgRuns,
  alpha = 0.10, n1 = nrow(Freeberger3Huber4b), n2 = nrow(Huber3Freeberger4b)
)
# pval = 0.065 - winner: Freeberger at 3 and Huber at 4
mean(Huber3Freeberger4b$AvgRuns)
mean(Freeberger3Huber4b$AvgRuns)

# Compare complete 6-9 arrangements (the digit after each name is the batting spot)
Broadhurst6Padilla7Brini8Garbowski9 <- BestStarters %>%
  filter(sixth == "Luke Broadhurst" & seventh == "Bryan Padilla" &
    eigth == "Niko Brini" & ninth == "Matt Garbowski")
Broadhurst7Padilla6Brini8Garbowski9 <- BestStarters %>%
  filter(seventh == "Luke Broadhurst" & sixth == "Bryan Padilla" &
    eigth == "Niko Brini" & ninth == "Matt Garbowski")
Broadhurst7Garbowski6Brini8Padilla9 <- BestStarters %>%
  filter(seventh == "Luke Broadhurst" & ninth == "Bryan Padilla" &
    eigth == "Niko Brini" & sixth == "Matt Garbowski")
Broadhurst6Padilla9Brini8Garbowski7 <- BestStarters %>%
  filter(sixth == "Luke Broadhurst" & ninth == "Bryan Padilla" &
    eigth == "Niko Brini" & seventh == "Matt Garbowski")
mean(Broadhurst6Padilla7Brini8Garbowski9$AvgRuns)
mean(Broadhurst7Padilla6Brini8Garbowski9$AvgRuns)
mean(Broadhurst7Garbowski6Brini8Padilla9$AvgRuns)
mean(Broadhurst6Padilla9Brini8Garbowski7$AvgRuns)
GeneralLineupComparisonTest(
  vector1 = Broadhurst6Padilla7Brini8Garbowski9$AvgRuns,
  vector2 = Broadhurst7Padilla6Brini8Garbowski9$AvgRuns,
  alpha = 0.10,
  n1 = nrow(Broadhurst6Padilla7Brini8Garbowski9),
  n2 = nrow(Broadhurst7Padilla6Brini8Garbowski9)
)
# pval = 0.489
GeneralLineupComparisonTest(
  vector1 = Broadhurst6Padilla7Brini8Garbowski9$AvgRuns,
  vector2 = Broadhurst7Garbowski6Brini8Padilla9$AvgRuns,
  alpha = 0.10,
  n1 = nrow(Broadhurst6Padilla7Brini8Garbowski9),
  n2 = nrow(Broadhurst7Garbowski6Brini8Padilla9)
)
# pval = 0.395
GeneralLineupComparisonTest(
  vector1 = Broadhurst6Padilla7Brini8Garbowski9$AvgRuns,
  vector2 = Broadhurst6Padilla9Brini8Garbowski7$AvgRuns,
  alpha = 0.10,
  n1 = nrow(Broadhurst6Padilla7Brini8Garbowski9),
  n2 = nrow(Broadhurst6Padilla9Brini8Garbowski7)
)
# pval = 0.3325

# Overall Conclusions:
# Results are much more of a mixed bag when we expand the complexity of our model
# Although very much a mixed bag, the top 5 is:
# Smith/Simmons, Smith/Simmons, Freeberger, Huber, Morton
# Starting lineup: Smith Simmons Freeberger Huber Morton Padilla Garbowski Broadhurst Brini
# Lineup Recommendation: Smith/Simmons Smith/Simmons Freeberger Huber Morton Broadhurst Padilla Brini Garbowski

#saveRDS(BestStarters, file = "BestStarters.rds")
#writexl::write_xlsx(BestStarters, "BestStarters.xlsx")

# Example of hypothesis test between two individual lineups
# Does the best lineup truly beat the best lineup in which Huber bats third and
# Freeberger bats fourth (lineup 46)?
# (Text_Results is sorted by descending AvgRuns, so row 1 is the best lineup;
#  200 is passed as n, the number of simulated games per lineup)
LineupComparisonTest(
  mu1 = Text_Results$AvgRuns[1], mu2 = Text_Results$AvgRuns[46],
  sd1 = Text_Results$SDRuns[1], sd2 = Text_Results$SDRuns[46],
  alpha = 0.20, n = 200
)
# ANSWER: pval = 0.343