library(readr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(RColorBrewer)
library(rpart)
library(rpart.plot)
library(class)
library(e1071)
# Load the Pokemon dataset from the local CSV export into a tibble.
Pokemon <- read_csv(file = "~/Downloads/datasetpokemon.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## Number = col_integer(),
## Total = col_integer(),
## HP = col_integer(),
## Attack = col_integer(),
## Defense = col_integer(),
## Sp_Atk = col_integer(),
## Sp_Def = col_integer(),
## Speed = col_integer(),
## Generation = col_integer(),
## Pr_Male = col_double(),
## Height_m = col_double(),
## Weight_kg = col_double(),
## Catch_Rate = col_integer()
## )
## See spec(...) for full column specifications.
# Show the first rows and the column types of the loaded data.
print(x = Pokemon)
## # A tibble: 721 x 23
## Number Name Type_1 Type_2 Total HP Attack Defense Sp_Atk
## <int> <chr> <chr> <chr> <int> <int> <int> <int> <int>
## 1 1 Bulbasaur Grass Poison 318 45 49 49 65
## 2 2 Ivysaur Grass Poison 405 60 62 63 80
## 3 3 Venusaur Grass Poison 525 80 82 83 100
## 4 4 Charmander Fire <NA> 309 39 52 43 60
## 5 5 Charmeleon Fire <NA> 405 58 64 58 80
## 6 6 Charizard Fire Flying 534 78 84 78 109
## 7 7 Squirtle Water <NA> 314 44 48 65 50
## 8 8 Wartortle Water <NA> 405 59 63 80 65
## 9 9 Blastoise Water <NA> 530 79 83 100 85
## 10 10 Caterpie Bug <NA> 195 45 30 35 20
## # ... with 711 more rows, and 14 more variables: Sp_Def <int>,
## # Speed <int>, Generation <int>, isLegendary <chr>, Color <chr>,
## # hasGender <chr>, Pr_Male <dbl>, Egg_Group_1 <chr>, Egg_Group_2 <chr>,
## # hasMegaEvolution <chr>, Height_m <dbl>, Weight_kg <dbl>,
## # Catch_Rate <int>, Body_Style <chr>
library(corrplot)
# Correlation matrix of the numeric battle stats.
# The columns are selected by name rather than by position so the analysis
# keeps working if the column order of the CSV changes.
stat_columns <- c("Total", "HP", "Attack", "Defense", "Sp_Atk", "Sp_Def", "Speed")
Pokemon1 <- data.frame(Pokemon)[, stat_columns]
P <- cor(Pokemon1)

# Colour ramps kept available for experimentation; the corrplot() calls below
# use corrplot's default palette.
col1 <- colorRampPalette(c("#7F0000", "red", "#FF7F00", "yellow", "white", "cyan", "#007FFF", "blue", "#00007F"))
col2 <- colorRampPalette(c("#67001F", "#B2182B", "#D6604D", "#F4A582", "#FDDBC7", "#FFFFFF", "#D1E5F0", "#92C5DE", "#4393C3", "#2166AC", "#053061"))
col3 <- colorRampPalette(c("red", "white", "blue"))
col4 <- colorRampPalette(c("#7F0000", "red", "#FF7F00", "yellow", "#7FFF7F", "cyan", "#007FFF", "blue", "#00007F"))
wb <- c("white", "black")

# Ask before advancing to the next plot only for the three correlation plots,
# then restore the previous graphics setting so later plots are unaffected.
old_par <- par(ask = TRUE)
# `mar` reserves space at the top of the device; without it corrplot clips
# the title.
corrplot(P, method = "number", title = "Correlation Between Stats", mar = c(0, 0, 1, 0))
corrplot(P, title = "Correlation Between Stats", mar = c(0, 0, 1, 0))
corrplot(P, order = "original", addCoef.col = "grey", title = "Correlation Between Stats", mar = c(0, 0, 1, 0))
par(old_par)
# Working copy for the plots and models below. The file was already parsed
# into `Pokemon` above, so reuse that tibble instead of reading and parsing
# the same CSV a second time; `Pokemon` itself is left untouched.
Pokemon_ <- Pokemon
## Parsed with column specification:
## cols(
## .default = col_character(),
## Number = col_integer(),
## Total = col_integer(),
## HP = col_integer(),
## Attack = col_integer(),
## Defense = col_integer(),
## Sp_Atk = col_integer(),
## Sp_Def = col_integer(),
## Speed = col_integer(),
## Generation = col_integer(),
## Pr_Male = col_double(),
## Height_m = col_double(),
## Weight_kg = col_double(),
## Catch_Rate = col_integer()
## )
## See spec(...) for full column specifications.
library(ggplot2)
# Bar chart of how many Pokemon were introduced in each generation.
# The column is referenced by its bare name inside aes(): using
# `Pokemon_$Generation` there bypasses ggplot2's data masking and breaks as
# soon as the plot is faceted or given different data.
generationdistribution <- ggplot(Pokemon_, aes(x = Generation)) +
  geom_bar(stat = "count", fill = "yellow") +
  labs(x = "Generation", y = "Count", title = "Count of Pokemon in Each Generation")
generationdistribution
# Make every column name a syntactically valid R name so columns can be used
# as bare names in aes() and in model formulas.
colnames(Pokemon_) <- make.names(names(Pokemon_))

# Histogram of Total stats, one panel per generation.
# - Generation is categorical, so it is mapped to fill as a factor; a numeric
#   fill cannot be carried through the binning and is not shown.
# - "center" is not a valid legend position; the facet strips already identify
#   the generation, so the legend is hidden like in the other histograms.
# - bins is set explicitly (30 is the value ggplot2 picked by default).
ggplot(Pokemon_, aes(x = Total, fill = factor(Generation))) +
  facet_wrap(~Generation) +
  geom_histogram(color = "darkred", bins = 30) +
  theme(legend.position = "none") +
  labs(x = "Total Stats", y = "Count", title = "Distribution of Total Stats by Generation")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of Attack, one panel per generation.
# The column names were already sanitised with make.names() above, so that
# step is not repeated here.
# Generation is mapped to fill as a factor because it is a category, and the
# bin count is stated explicitly (30 is the value ggplot2 picked by default).
ggplot(Pokemon_, aes(x = Attack, fill = factor(Generation))) +
  facet_wrap(~Generation) +
  geom_histogram(color = "pink", bins = 30) +
  theme(legend.position = "none") +
  labs(x = "Attack", y = "Count", title = "Distribution of Attack by Generation")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of Sp_Atk, one panel per generation.
# Sp_Atk is the Special Attack stat (Speed is a separate column), so the axis
# label and title say "Special Attack" rather than "Speed Attack".
# Generation is mapped to fill as a factor because it is a category, and the
# bin count is stated explicitly (30 is the value ggplot2 picked by default).
ggplot(Pokemon_, aes(x = Sp_Atk, fill = factor(Generation))) +
  facet_wrap(~Generation) +
  geom_histogram(color = "yellow", bins = 30) +
  theme(legend.position = "none") +
  labs(x = "Special Attack", y = "Count", title = "Distribution of Special Attack by Generation")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Scatter plot of Attack against Total, coloured by generation.
# Generation is a category, so it is converted to a factor to get one distinct
# colour per generation instead of a continuous gradient.
plot1 <- ggplot(Pokemon_, aes(x = Total, y = Attack, color = factor(Generation))) +
  geom_point() +
  labs(
    x = "Total Stats", y = "Attack", color = "Generation",
    title = "Plot of Total Stats by Attack based on Generation"
  )
plot1

# Same plot for Sp_Atk. Sp_Atk is the Special Attack stat (Speed is a separate
# column), so the title says "Special Attack" rather than "Speed Attack".
plot1 <- ggplot(Pokemon_, aes(x = Total, y = Sp_Atk, color = factor(Generation))) +
  geom_point() +
  labs(
    x = "Total", y = "Sp_Atk", color = "Generation",
    title = "Plot of Total Stats by Special Attack based on Generation"
  )
plot1
# Classification tree predicting Generation from Total alone.
# Every variable in the formula is a column of `data`, so the model does not
# depend on free-standing copies of the columns in the global environment and
# predict() can be applied to any data frame that has the same columns.
tree_classifier <- rpart(
  Generation ~ Total,
  data = Pokemon_,
  method = "class",
  control = rpart.control(maxdepth = 20, minsplit = 100)
)
summary(tree_classifier)
## Call:
## rpart(formula = Pokemon_$Generation ~ Total, data = Pokemon_,
## method = "class", control = rpart.control(maxdepth = 20,
## minsplit = 100))
## n= 721
##
## CP nsplit rel error xerror xstd
## 1 0.02477876 0 1.0000000 1.017699 0.01909828
## 2 0.01000000 1 0.9752212 1.042478 0.01837928
##
## Variable importance
## Total
## 100
##
## Node number 1: 721 observations, complexity param=0.02477876
## predicted class=5 expected loss=0.7836338 P(node) =1
## class counts: 151 100 135 107 156 72
## probabilities: 0.209 0.139 0.187 0.148 0.216 0.100
## left son=2 (43 obs) right son=3 (678 obs)
## Primary splits:
## Total < 250.5 to the left, improve=4.572532, (0 missing)
##
## Node number 2: 43 observations
## predicted class=2 expected loss=0.6744186 P(node) =0.05963939
## class counts: 6 14 13 6 0 4
## probabilities: 0.140 0.326 0.302 0.140 0.000 0.093
##
## Node number 3: 678 observations
## predicted class=5 expected loss=0.7699115 P(node) =0.9403606
## class counts: 145 86 122 101 156 68
## probabilities: 0.214 0.127 0.180 0.149 0.230 0.100
# Draw the Total-only tree.
rpart.plot(tree_classifier)
title("Prediction Generation on Total")

# Classification tree predicting Generation from Total and Attack.
# Every variable in the formula is a column of `data`, so the model does not
# depend on free-standing copies of the columns in the global environment.
tree_classifier2 <- rpart(
  Generation ~ Total + Attack,
  data = Pokemon_,
  method = "class",
  control = rpart.control(maxdepth = 20, minsplit = 60)
)
summary(tree_classifier2)
## Call:
## rpart(formula = Pokemon_$Generation ~ Total + Attack, data = Pokemon_,
## method = "class", control = rpart.control(maxdepth = 20,
## minsplit = 60))
## n= 721
##
## CP nsplit rel error xerror xstd
## 1 0.02477876 0 1.0000000 1.008850 0.01933792
## 2 0.01000000 1 0.9752212 1.014159 0.01919518
##
## Variable importance
## Total Attack
## 93 7
##
## Node number 1: 721 observations, complexity param=0.02477876
## predicted class=5 expected loss=0.7836338 P(node) =1
## class counts: 151 100 135 107 156 72
## probabilities: 0.209 0.139 0.187 0.148 0.216 0.100
## left son=2 (43 obs) right son=3 (678 obs)
## Primary splits:
## Total < 250.5 to the left, improve=4.572532, (0 missing)
## Attack < 84.5 to the left, improve=2.965793, (0 missing)
## Surrogate splits:
## Attack < 22.5 to the left, agree=0.945, adj=0.07, (0 split)
##
## Node number 2: 43 observations
## predicted class=2 expected loss=0.6744186 P(node) =0.05963939
## class counts: 6 14 13 6 0 4
## probabilities: 0.140 0.326 0.302 0.140 0.000 0.093
##
## Node number 3: 678 observations
## predicted class=5 expected loss=0.7699115 P(node) =0.9403606
## class counts: 145 86 122 101 156 68
## probabilities: 0.214 0.127 0.180 0.149 0.230 0.100
# Draw the Total + Attack tree.
rpart.plot(tree_classifier2)
title("Prediction Generation on Total, Attack")

# Classification tree predicting Generation from Total, Attack and Sp_Atk.
# The predictor is the Sp_Atk column itself rather than a global vector named
# SpeedAttack: a global that is not a column of `data` is only found through
# the formula's environment, so predictions on any other data frame would
# silently reuse the training values (or fail on a length mismatch).
tree_classifier3 <- rpart(
  Generation ~ Total + Attack + Sp_Atk,
  data = Pokemon_,
  method = "class",
  control = rpart.control(maxdepth = 20, minsplit = 60)
)
summary(tree_classifier3)
## Call:
## rpart(formula = Pokemon_$Generation ~ Total + Attack + SpeedAttack,
## data = Pokemon_, method = "class", control = rpart.control(maxdepth = 20,
## minsplit = 60))
## n= 721
##
## CP nsplit rel error xerror xstd
## 1 0.02477876 0 1.0000000 1.067257 0.01758262
## 2 0.01179941 1 0.9752212 1.040708 0.01843310
## 3 0.01061947 5 0.9274336 1.046018 0.01827045
## 4 0.01002950 6 0.9168142 1.044248 0.01832507
## 5 0.01000000 9 0.8867257 1.044248 0.01832507
##
## Variable importance
## Total Attack SpeedAttack
## 57 27 16
##
## Node number 1: 721 observations, complexity param=0.02477876
## predicted class=5 expected loss=0.7836338 P(node) =1
## class counts: 151 100 135 107 156 72
## probabilities: 0.209 0.139 0.187 0.148 0.216 0.100
## left son=2 (43 obs) right son=3 (678 obs)
## Primary splits:
## Total < 250.5 to the left, improve=4.572532, (0 missing)
## Attack < 84.5 to the left, improve=2.965793, (0 missing)
## SpeedAttack < 115.5 to the left, improve=2.694889, (0 missing)
## Surrogate splits:
## SpeedAttack < 29.5 to the left, agree=0.950, adj=0.163, (0 split)
## Attack < 22.5 to the left, agree=0.945, adj=0.070, (0 split)
##
## Node number 2: 43 observations
## predicted class=2 expected loss=0.6744186 P(node) =0.05963939
## class counts: 6 14 13 6 0 4
## probabilities: 0.140 0.326 0.302 0.140 0.000 0.093
##
## Node number 3: 678 observations, complexity param=0.01179941
## predicted class=5 expected loss=0.7699115 P(node) =0.9403606
## class counts: 145 86 122 101 156 68
## probabilities: 0.214 0.127 0.180 0.149 0.230 0.100
## left son=6 (475 obs) right son=7 (203 obs)
## Primary splits:
## Total < 492 to the left, improve=3.668560, (0 missing)
## SpeedAttack < 115.5 to the left, improve=2.395716, (0 missing)
## Attack < 111 to the left, improve=2.336745, (0 missing)
## Surrogate splits:
## SpeedAttack < 96 to the left, agree=0.798, adj=0.325, (0 split)
## Attack < 97.5 to the left, agree=0.788, adj=0.291, (0 split)
##
## Node number 6: 475 observations, complexity param=0.01179941
## predicted class=5 expected loss=0.76 P(node) =0.6588072
## class counts: 108 61 97 54 114 41
## probabilities: 0.227 0.128 0.204 0.114 0.240 0.086
## left son=12 (435 obs) right son=13 (40 obs)
## Primary splits:
## Total < 482.5 to the left, improve=2.814785, (0 missing)
## SpeedAttack < 54.5 to the left, improve=1.908587, (0 missing)
## Attack < 84.5 to the left, improve=1.677121, (0 missing)
## Surrogate splits:
## SpeedAttack < 122.5 to the left, agree=0.918, adj=0.025, (0 split)
##
## Node number 7: 203 observations
## predicted class=4 expected loss=0.7684729 P(node) =0.2815534
## class counts: 37 25 25 47 42 27
## probabilities: 0.182 0.123 0.123 0.232 0.207 0.133
##
## Node number 12: 435 observations, complexity param=0.01179941
## predicted class=1 expected loss=0.7770115 P(node) =0.6033287
## class counts: 97 58 92 51 96 41
## probabilities: 0.223 0.133 0.211 0.117 0.221 0.094
## left son=24 (177 obs) right son=25 (258 obs)
## Primary splits:
## SpeedAttack < 53.5 to the left, improve=2.611125, (0 missing)
## Total < 329.5 to the right, improve=2.413972, (0 missing)
## Attack < 84.5 to the left, improve=1.218174, (0 missing)
## Surrogate splits:
## Total < 329.5 to the left, agree=0.717, adj=0.305, (0 split)
## Attack < 24.5 to the left, agree=0.598, adj=0.011, (0 split)
##
## Node number 13: 40 observations
## predicted class=5 expected loss=0.55 P(node) =0.0554785
## class counts: 11 3 5 3 18 0
## probabilities: 0.275 0.075 0.125 0.075 0.450 0.000
##
## Node number 24: 177 observations, complexity param=0.01179941
## predicted class=5 expected loss=0.7175141 P(node) =0.2454924
## class counts: 47 18 31 19 50 12
## probabilities: 0.266 0.102 0.175 0.107 0.282 0.068
## left son=48 (20 obs) right son=49 (157 obs)
## Primary splits:
## Attack < 43.5 to the left, improve=2.9525500, (0 missing)
## Total < 302.5 to the left, improve=1.1299440, (0 missing)
## SpeedAttack < 30.5 to the right, improve=0.9809067, (0 missing)
##
## Node number 25: 258 observations, complexity param=0.0100295
## predicted class=3 expected loss=0.7635659 P(node) =0.3578363
## class counts: 50 40 61 32 46 29
## probabilities: 0.194 0.155 0.236 0.124 0.178 0.112
## left son=50 (228 obs) right son=51 (30 obs)
## Primary splits:
## Total < 470.5 to the left, improve=1.975112, (0 missing)
## Attack < 97.5 to the right, improve=1.927243, (0 missing)
## SpeedAttack < 99.5 to the right, improve=1.389027, (0 missing)
##
## Node number 48: 20 observations
## predicted class=3 expected loss=0.55 P(node) =0.02773925
## class counts: 3 3 9 3 2 0
## probabilities: 0.150 0.150 0.450 0.150 0.100 0.000
##
## Node number 49: 157 observations, complexity param=0.01061947
## predicted class=5 expected loss=0.6942675 P(node) =0.2177531
## class counts: 44 15 22 16 48 12
## probabilities: 0.280 0.096 0.140 0.102 0.306 0.076
## left son=98 (20 obs) right son=99 (137 obs)
## Primary splits:
## Attack < 49.5 to the left, improve=1.432856, (0 missing)
## Total < 328.5 to the left, improve=1.398573, (0 missing)
## SpeedAttack < 45.5 to the left, improve=1.299995, (0 missing)
## Surrogate splits:
## Total < 261 to the left, agree=0.879, adj=0.05, (0 split)
##
## Node number 50: 228 observations, complexity param=0.0100295
## predicted class=3 expected loss=0.754386 P(node) =0.3162275
## class counts: 47 39 56 25 37 24
## probabilities: 0.206 0.171 0.246 0.110 0.162 0.105
## left son=100 (134 obs) right son=101 (94 obs)
## Primary splits:
## Attack < 59.5 to the right, improve=1.922705, (0 missing)
## Total < 308.5 to the right, improve=1.921607, (0 missing)
## SpeedAttack < 94.5 to the right, improve=1.577188, (0 missing)
## Surrogate splits:
## Total < 349 to the right, agree=0.746, adj=0.383, (0 split)
## SpeedAttack < 94.5 to the left, agree=0.601, adj=0.032, (0 split)
##
## Node number 51: 30 observations
## predicted class=5 expected loss=0.7 P(node) =0.04160888
## class counts: 3 1 5 7 9 5
## probabilities: 0.100 0.033 0.167 0.233 0.300 0.167
##
## Node number 98: 20 observations
## predicted class=1 expected loss=0.55 P(node) =0.02773925
## class counts: 9 2 4 2 3 0
## probabilities: 0.450 0.100 0.200 0.100 0.150 0.000
##
## Node number 99: 137 observations
## predicted class=5 expected loss=0.6715328 P(node) =0.1900139
## class counts: 35 13 18 14 45 12
## probabilities: 0.255 0.095 0.131 0.102 0.328 0.088
##
## Node number 100: 134 observations
## predicted class=3 expected loss=0.6940299 P(node) =0.185853
## class counts: 27 23 41 15 20 8
## probabilities: 0.201 0.172 0.306 0.112 0.149 0.060
##
## Node number 101: 94 observations, complexity param=0.0100295
## predicted class=1 expected loss=0.787234 P(node) =0.1303745
## class counts: 20 16 15 10 17 16
## probabilities: 0.213 0.170 0.160 0.106 0.181 0.170
## left son=202 (72 obs) right son=203 (22 obs)
## Primary splits:
## Total < 308.5 to the right, improve=2.505856, (0 missing)
## SpeedAttack < 84 to the right, improve=2.424867, (0 missing)
## Attack < 47.5 to the left, improve=1.831175, (0 missing)
## Surrogate splits:
## Attack < 32.5 to the right, agree=0.798, adj=0.136, (0 split)
## SpeedAttack < 55.5 to the right, agree=0.787, adj=0.091, (0 split)
##
## Node number 202: 72 observations
## predicted class=1 expected loss=0.7361111 P(node) =0.0998613
## class counts: 19 14 11 8 8 12
## probabilities: 0.264 0.194 0.153 0.111 0.111 0.167
##
## Node number 203: 22 observations
## predicted class=5 expected loss=0.5909091 P(node) =0.03051318
## class counts: 1 2 4 2 9 4
## probabilities: 0.045 0.091 0.182 0.091 0.409 0.182
# Draw the three-predictor tree. The third predictor is the Sp_Atk column,
# i.e. Special Attack (Speed is a separate stat), so the title names it so.
rpart.plot(tree_classifier3)
title("Prediction Generation on Total, Attack, and Special Attack")
# Predicted Generation class for every row of Pokemon_ from each of the three
# trees. NOTE(review): these are predictions on the same rows the trees were
# fitted on, so the accuracies computed from them are training accuracies.
treeprediction1 <- predict(object = tree_classifier, newdata = Pokemon_, type = "class")
treeprediction2 <- predict(object = tree_classifier2, newdata = Pokemon_, type = "class")
treeprediction3 <- predict(object = tree_classifier3, newdata = Pokemon_, type = "class")
# Fraction of predictions that equal the ground truth.
#
# predictions:  vector (or factor) of predicted labels.
# ground_truth: vector of true labels, same length as `predictions`.
# Returns a single number between 0 and 1.
# Stops with an error when the two inputs differ in length, because `==`
# would otherwise silently recycle the shorter one and return a meaningless
# score.
accuracy <- function(predictions, ground_truth) {
  if (length(predictions) != length(ground_truth)) {
    stop("`predictions` and `ground_truth` must have the same length", call. = FALSE)
  }
  mean(predictions == ground_truth)
}
# Share of rows whose predicted Generation equals the recorded Generation,
# for each of the three trees in turn.
accuracy(predictions = treeprediction1, ground_truth = Pokemon_$Generation)
## [1] 0.2357836
accuracy(predictions = treeprediction2, ground_truth = Pokemon_$Generation)
## [1] 0.2357836
accuracy(predictions = treeprediction3, ground_truth = Pokemon_$Generation)
## [1] 0.3051318
library(corrplot)
# Linear regression of Total on Attack for Generation 1 only.
# The scatter plot is restricted to the same rows the model is fitted on, so
# the points and the fitted line describe the same data. which() drops rows
# whose Generation is missing.
gen_rows <- Pokemon_[which(Pokemon_$Generation == 1), ]
plot(gen_rows$Attack, gen_rows$Total, xlab = "Attack", ylab = "Total")
new_model <- lm(Total ~ Attack, data = gen_rows)
title("Generation 1 Regression")
abline(new_model)
summary(new_model)
##
## Call:
## lm(formula = y ~ x, data = Pokemon_, subset = which(c(Pokemon_$Generation ==
## "1")))
##
## Residuals:
## Min 1Q Median 3Q Max
## -163.732 -53.326 -3.083 52.546 212.891
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 224.5279 17.5970 12.76 <2e-16 ***
## x 2.5162 0.2278 11.04 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 74.21 on 149 degrees of freedom
## Multiple R-squared: 0.4502, Adjusted R-squared: 0.4465
## F-statistic: 122 on 1 and 149 DF, p-value: < 2.2e-16
# Linear regression of Total on Attack for Generation 2 only.
# The scatter plot is restricted to the same rows the model is fitted on, so
# the points and the fitted line describe the same data. which() drops rows
# whose Generation is missing.
gen_rows <- Pokemon_[which(Pokemon_$Generation == 2), ]
plot(gen_rows$Attack, gen_rows$Total, xlab = "Attack", ylab = "Total")
new_model <- lm(Total ~ Attack, data = gen_rows)
title("Generation 2 Regression")
abline(new_model)
summary(new_model)
##
## Call:
## lm(formula = y ~ x, data = Pokemon_, subset = which(c(Pokemon_$Generation ==
## "2")))
##
## Residuals:
## Min 1Q Median 3Q Max
## -136.72 -54.74 -11.03 60.53 285.38
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 228.6105 22.1967 10.299 < 2e-16 ***
## x 2.6014 0.3004 8.659 9.76e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 84.95 on 98 degrees of freedom
## Multiple R-squared: 0.4335, Adjusted R-squared: 0.4277
## F-statistic: 74.98 on 1 and 98 DF, p-value: 9.764e-14
# Linear regression of Total on Attack for Generation 3 only.
# The scatter plot is restricted to the same rows the model is fitted on, so
# the points and the fitted line describe the same data. which() drops rows
# whose Generation is missing.
gen_rows <- Pokemon_[which(Pokemon_$Generation == 3), ]
plot(gen_rows$Attack, gen_rows$Total, xlab = "Attack", ylab = "Total")
new_model <- lm(Total ~ Attack, data = gen_rows)
title("Generation 3 Regression")
abline(new_model)
summary(new_model)
##
## Call:
## lm(formula = y ~ x, data = Pokemon_, subset = which(c(Pokemon_$Generation ==
## "3")))
##
## Residuals:
## Min 1Q Median 3Q Max
## -213.180 -54.305 -5.478 49.671 242.422
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 198.0745 17.6194 11.24 <2e-16 ***
## x 2.7901 0.2227 12.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 78.3 on 133 degrees of freedom
## Multiple R-squared: 0.5414, Adjusted R-squared: 0.5379
## F-statistic: 157 on 1 and 133 DF, p-value: < 2.2e-16
# Linear regression of Total on Attack for Generation 4 only.
# The scatter plot is restricted to the same rows the model is fitted on, so
# the points and the fitted line describe the same data. which() drops rows
# whose Generation is missing.
gen_rows <- Pokemon_[which(Pokemon_$Generation == 4), ]
plot(gen_rows$Attack, gen_rows$Total, xlab = "Attack", ylab = "Total")
new_model <- lm(Total ~ Attack, data = gen_rows)
title("Generation 4 Regression")
abline(new_model)
summary(new_model)
##
## Call:
## lm(formula = y ~ x, data = Pokemon_, subset = which(c(Pokemon_$Generation ==
## "4")))
##
## Residuals:
## Min 1Q Median 3Q Max
## -216.864 -61.804 -1.131 53.968 182.490
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 227.4237 22.1120 10.29 <2e-16 ***
## x 2.7155 0.2567 10.58 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 82.12 on 105 degrees of freedom
## Multiple R-squared: 0.516, Adjusted R-squared: 0.5113
## F-statistic: 111.9 on 1 and 105 DF, p-value: < 2.2e-16
# Linear regression of Total on Attack for Generation 5 only.
# The scatter plot is restricted to the same rows the model is fitted on, so
# the points and the fitted line describe the same data. which() drops rows
# whose Generation is missing.
gen_rows <- Pokemon_[which(Pokemon_$Generation == 5), ]
plot(gen_rows$Attack, gen_rows$Total, xlab = "Attack", ylab = "Total")
new_model <- lm(Total ~ Attack, data = gen_rows)
title("Generation 5 Regression")
abline(new_model)
summary(new_model)
##
## Call:
## lm(formula = y ~ x, data = Pokemon_, subset = which(c(Pokemon_$Generation ==
## "5")))
##
## Residuals:
## Min 1Q Median 3Q Max
## -132.826 -54.214 -5.366 45.088 184.131
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 226.5813 17.3425 13.06 <2e-16 ***
## x 2.4583 0.2019 12.18 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 73.24 on 154 degrees of freedom
## Multiple R-squared: 0.4905, Adjusted R-squared: 0.4872
## F-statistic: 148.3 on 1 and 154 DF, p-value: < 2.2e-16
# Linear regression of Total on Attack for Generation 6 only.
# The scatter plot is restricted to the same rows the model is fitted on, so
# the points and the fitted line describe the same data. which() drops rows
# whose Generation is missing.
gen_rows <- Pokemon_[which(Pokemon_$Generation == 6), ]
plot(gen_rows$Attack, gen_rows$Total, xlab = "Attack", ylab = "Total")
new_model <- lm(Total ~ Attack, data = gen_rows)
title("Generation 6 Regression")
abline(new_model)
summary(new_model)
##
## Call:
## lm(formula = y ~ x, data = Pokemon_, subset = which(c(Pokemon_$Generation ==
## "6")))
##
## Residuals:
## Min 1Q Median 3Q Max
## -129.386 -57.096 6.184 49.251 164.826
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 189.821 26.346 7.205 5.28e-10 ***
## x 3.307 0.343 9.642 1.78e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 73.88 on 70 degrees of freedom
## Multiple R-squared: 0.5705, Adjusted R-squared: 0.5643
## F-statistic: 92.97 on 1 and 70 DF, p-value: 1.778e-14
# Scatter plot of Total against Catch_Rate, with points coloured by the
# isLegendary column.
plot2 <- ggplot(
  data = Pokemon_,
  mapping = aes(x = Catch_Rate, y = Total, color = isLegendary)
) +
  geom_point() +
  labs(
    x = "Catch Rate",
    y = "Total",
    title = "Plot of Total Stats by Catch Rate based on Legendary Status"
  )
plot2
## Based on the plot, we can observe the dispersion. A large proportion of the Pokemon are non-Legendary. Additionally, Legendary Pokemon have low catch rates and high total stats. This suggests that Legendary status may be related to both Total Stats and Catch Rate.
# Classification tree predicting isLegendary from the catch rate.
# The formula refers to the Catch_Rate and isLegendary columns of `data`
# directly instead of a global copy (`Catch`) and a `$` lookup, so predict()
# works on any data frame that has a Catch_Rate column.
tree_classifier <- rpart(
  isLegendary ~ Catch_Rate,
  data = Pokemon_,
  method = "class",
  control = rpart.control(maxdepth = 10, minsplit = 100)
)
summary(tree_classifier)
## Call:
## rpart(formula = Pokemon_$isLegendary ~ Catch, data = Pokemon_,
## method = "class", control = rpart.control(maxdepth = 10,
## minsplit = 100))
## n= 721
##
## CP nsplit rel error xerror xstd
## 1 0.7391304 0 1.0000000 1.0000000 0.14266102
## 2 0.0100000 1 0.2608696 0.2608696 0.07467724
##
## Variable importance
## Catch
## 100
##
## Node number 1: 721 observations, complexity param=0.7391304
## predicted class=False expected loss=0.06380028 P(node) =1
## class counts: 675 46
## probabilities: 0.936 0.064
## left son=2 (671 obs) right son=3 (50 obs)
## Primary splits:
## Catch < 9 to the right, improve=64.73806, (0 missing)
##
## Node number 2: 671 observations
## predicted class=False expected loss=0.005961252 P(node) =0.9306519
## class counts: 667 4
## probabilities: 0.994 0.006
##
## Node number 3: 50 observations
## predicted class=True expected loss=0.16 P(node) =0.06934813
## class counts: 8 42
## probabilities: 0.160 0.840
# Draw the tree fitted just above. That model predicts isLegendary from the
# catch rate, so the title names Catch Rate (it previously said "Total").
rpart.plot(tree_classifier)
title("Predicting Legendary using Catch Rate")

# Predicted class for every row of Pokemon_, compared against the recorded
# isLegendary value. accuracy() is already defined earlier in this script, so
# it is not redefined here.
treeprediction1 <- predict(tree_classifier, Pokemon_, type = "class")
accuracy(treeprediction1, Pokemon_$isLegendary)
## [1] 0.9833564
# Classification tree predicting isLegendary from Total.
# Both variables are columns of `data`, so no global copy of Total and no `$`
# lookup inside the formula is needed.
tree_classifier <- rpart(
  isLegendary ~ Total,
  data = Pokemon_,
  method = "class",
  control = rpart.control(maxdepth = 10, minsplit = 100)
)
summary(tree_classifier)
## Call:
## rpart(formula = Pokemon_$isLegendary ~ Total, data = Pokemon_,
## method = "class", control = rpart.control(maxdepth = 10,
## minsplit = 100))
## n= 721
##
## CP nsplit rel error xerror xstd
## 1 0.673913 0 1.000000 1.000000 0.14266102
## 2 0.010000 1 0.326087 0.326087 0.08331487
##
## Variable importance
## Total
## 100
##
## Node number 1: 721 observations, complexity param=0.673913
## predicted class=False expected loss=0.06380028 P(node) =1
## class counts: 675 46
## probabilities: 0.936 0.064
## left son=2 (660 obs) right son=3 (61 obs)
## Primary splits:
## Total < 573.5 to the left, improve=63.50742, (0 missing)
##
## Node number 2: 660 observations
## predicted class=False expected loss=0 P(node) =0.9153953
## class counts: 660 0
## probabilities: 1.000 0.000
##
## Node number 3: 61 observations
## predicted class=True expected loss=0.2459016 P(node) =0.08460472
## class counts: 15 46
## probabilities: 0.246 0.754
# Draw the tree fitted just above (isLegendary predicted from Total).
rpart.plot(tree_classifier)
title("Predicting Legendary using Total")

# Predicted class for every row of Pokemon_, compared against the recorded
# isLegendary value. accuracy() is already defined earlier in this script, so
# it is not redefined here.
treeprediction1 <- predict(tree_classifier, Pokemon_, type = "class")
accuracy(treeprediction1, Pokemon_$isLegendary)
## [1] 0.9791956