##install.packages("psych")
library(psych)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(corrplot)
## corrplot 0.90 loaded
#Read in the dataset
dataset_Name<-"Age by Gender.csv"
age_by_gender <- read.csv(dataset_Name, na.string=c("", " "))
##print
head(age_by_gender, n=5)
## ID.Gender Gender ID.Age Age ID.Year Year ID.Workforce.Status Workforce.Status
## 1 1 Male 20 20 2017 2017 true true
## 2 1 Male 21 21 2017 2017 true true
## 3 1 Male 22 22 2017 2017 true true
## 4 1 Male 23 23 2017 2017 true true
## 5 1 Male 24 24 2017 2017 true true
## Total.Population Total.Population.MOE.Appx Record.Count PUMS.Occupation
## 1 770 681.2696 7 Social workers
## 2 883 729.5479 7 Social workers
## 3 1138 828.2171 11 Social workers
## 4 2192 1149.4546 16 Social workers
## 5 3200 1388.8171 26 Social workers
## ID.PUMS.Occupation Slug.PUMS.Occupation share
## 1 211020 social-workers 0.004767654
## 2 211020 social-workers 0.005467323
## 3 211020 social-workers 0.007046221
## 4 211020 social-workers 0.013572335
## 5 211020 social-workers 0.019813628
## Have a look at the data types
str(age_by_gender)
## 'data.frame': 434 obs. of 15 variables:
## $ ID.Gender : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ ID.Age : int 20 21 22 23 24 25 26 27 28 29 ...
## $ Age : int 20 21 22 23 24 25 26 27 28 29 ...
## $ ID.Year : int 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 ...
## $ Year : int 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 ...
## $ ID.Workforce.Status : chr "true" "true" "true" "true" ...
## $ Workforce.Status : chr "true" "true" "true" "true" ...
## $ Total.Population : int 770 883 1138 2192 3200 3686 4533 3449 4998 5244 ...
## $ Total.Population.MOE.Appx: num 681 730 828 1149 1389 ...
## $ Record.Count : int 7 7 11 16 26 33 38 33 37 44 ...
## $ PUMS.Occupation : chr "Social workers" "Social workers" "Social workers" "Social workers" ...
## $ ID.PUMS.Occupation : int 211020 211020 211020 211020 211020 211020 211020 211020 211020 211020 ...
## $ Slug.PUMS.Occupation : chr "social-workers" "social-workers" "social-workers" "social-workers" ...
## $ share : num 0.00477 0.00547 0.00705 0.01357 0.01981 ...
## Get column names
(ColNames<-names(age_by_gender))
## [1] "ID.Gender" "Gender"
## [3] "ID.Age" "Age"
## [5] "ID.Year" "Year"
## [7] "ID.Workforce.Status" "Workforce.Status"
## [9] "Total.Population" "Total.Population.MOE.Appx"
## [11] "Record.Count" "PUMS.Occupation"
## [13] "ID.PUMS.Occupation" "Slug.PUMS.Occupation"
## [15] "share"
for(name in 1:length(ColNames)){
cat(ColNames[name], "\n")
}
## ID.Gender
## Gender
## ID.Age
## Age
## ID.Year
## Year
## ID.Workforce.Status
## Workforce.Status
## Total.Population
## Total.Population.MOE.Appx
## Record.Count
## PUMS.Occupation
## ID.PUMS.Occupation
## Slug.PUMS.Occupation
## share
#Number of rows and columns before cleaning
(NumColumns <-ncol(age_by_gender))
## [1] 15
(NumRows <-nrow(age_by_gender))
## [1] 434
#Let's make tables of all the columns
#lapply(age_by_gender,table)
lapply(age_by_gender,summary)
## $ID.Gender
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 1.528 2.000 2.000
##
## $Gender
## Length Class Mode
## 434 character character
##
## $ID.Age
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 33.00 47.00 46.85 60.00 81.00
##
## $Age
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 33.00 47.00 46.85 60.00 81.00
##
## $ID.Year
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2014 2014 2016 2016 2017 2017
##
## $Year
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2014 2014 2016 2016 2017 2017
##
## $ID.Workforce.Status
## Length Class Mode
## 434 character character
##
## $Workforce.Status
## Length Class Mode
## 434 character character
##
## $Total.Population
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 270 2645 4142 7692 13970 23749
##
## $Total.Population.MOE.Appx
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 403.4 1262.6 1580.2 1941.9 2901.6 3783.2
##
## $Record.Count
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 27.00 38.00 74.56 142.75 209.00
##
## $PUMS.Occupation
## Length Class Mode
## 434 character character
##
## $ID.PUMS.Occupation
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 211020 211020 211020 211020 211020 211020
##
## $Slug.PUMS.Occupation
## Length Class Mode
## 434 character character
##
## $share
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0004017 0.0123359 0.0199890 0.0184332 0.0254779 0.0365852
#Remove columns we do not need or want
head(age_by_gender <- subset(age_by_gender, select=-c(Gender,ID.Age,ID.Year,ID.Workforce.Status,ID.PUMS.Occupation,Slug.PUMS.Occupation)))
## ID.Gender Age Year Workforce.Status Total.Population
## 1 1 20 2017 true 770
## 2 1 21 2017 true 883
## 3 1 22 2017 true 1138
## 4 1 23 2017 true 2192
## 5 1 24 2017 true 3200
## 6 1 25 2017 true 3686
## Total.Population.MOE.Appx Record.Count PUMS.Occupation share
## 1 681.2696 7 Social workers 0.004767654
## 2 729.5479 7 Social workers 0.005467323
## 3 828.2171 11 Social workers 0.007046221
## 4 1149.4546 16 Social workers 0.013572335
## 5 1388.8171 26 Social workers 0.019813628
## 6 1490.5516 33 Social workers 0.022822823
str(age_by_gender)
## 'data.frame': 434 obs. of 9 variables:
## $ ID.Gender : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : int 20 21 22 23 24 25 26 27 28 29 ...
## $ Year : int 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 ...
## $ Workforce.Status : chr "true" "true" "true" "true" ...
## $ Total.Population : int 770 883 1138 2192 3200 3686 4533 3449 4998 5244 ...
## $ Total.Population.MOE.Appx: num 681 730 828 1149 1389 ...
## $ Record.Count : int 7 7 11 16 26 33 38 33 37 44 ...
## $ PUMS.Occupation : chr "Social workers" "Social workers" "Social workers" "Social workers" ...
## $ share : num 0.00477 0.00547 0.00705 0.01357 0.01981 ...
# Checking for missing values
# Check the entire DF for missing values in total
head(is.na(age_by_gender) )
## ID.Gender Age Year Workforce.Status Total.Population
## 1 FALSE FALSE FALSE FALSE FALSE
## 2 FALSE FALSE FALSE FALSE FALSE
## 3 FALSE FALSE FALSE FALSE FALSE
## 4 FALSE FALSE FALSE FALSE FALSE
## 5 FALSE FALSE FALSE FALSE FALSE
## 6 FALSE FALSE FALSE FALSE FALSE
## Total.Population.MOE.Appx Record.Count PUMS.Occupation share
## 1 FALSE FALSE FALSE FALSE
## 2 FALSE FALSE FALSE FALSE
## 3 FALSE FALSE FALSE FALSE
## 4 FALSE FALSE FALSE FALSE
## 5 FALSE FALSE FALSE FALSE
## 6 FALSE FALSE FALSE FALSE
#lapply(age_by_gender, is.na)
## Using an inline function and sapply (for simplify apply)
#sapply(age_by_gender, function(x) sum(is.na(x)))
#Formatting
age_by_gender$ID.Gender <- factor(age_by_gender$ID.Gender)
age_by_gender$Year <- factor(age_by_gender$Year)
age_by_gender$PUMS.Occupation <- factor(age_by_gender$PUMS.Occupation)
age_by_gender$Workforce.Status <- factor(age_by_gender$Workforce.Status)
age_by_gender$Age <- as.numeric(age_by_gender$Age)
age_by_gender$Total.Population <- as.numeric(age_by_gender$Total.Population)
age_by_gender$Record.Count <- as.numeric(age_by_gender$Record.Count)
str(age_by_gender)
## 'data.frame': 434 obs. of 9 variables:
## $ ID.Gender : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : num 20 21 22 23 24 25 26 27 28 29 ...
## $ Year : Factor w/ 4 levels "2014","2015",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Workforce.Status : Factor w/ 1 level "true": 1 1 1 1 1 1 1 1 1 1 ...
## $ Total.Population : num 770 883 1138 2192 3200 ...
## $ Total.Population.MOE.Appx: num 681 730 828 1149 1389 ...
## $ Record.Count : num 7 7 11 16 26 33 38 33 37 44 ...
## $ PUMS.Occupation : Factor w/ 1 level "Social workers": 1 1 1 1 1 1 1 1 1 1 ...
## $ share : num 0.00477 0.00547 0.00705 0.01357 0.01981 ...
table(age_by_gender$Age)
##
## 18 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
## 1 6 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
## 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 7 7
## 71 72 73 74 75 76 77 78 81
## 6 5 6 4 3 1 1 2 1
## mean, variance
summary(age_by_gender$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 33.00 47.00 46.85 60.00 81.00
nrow(age_by_gender)
## [1] 434
sd(age_by_gender$Age, na.rm = T)
## [1] 15.87455
pairs(age_by_gender[,c(2,5)],col = "blue")
lapply(age_by_gender,summary)
## $ID.Gender
## 1 2
## 205 229
##
## $Age
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 33.00 47.00 46.85 60.00 81.00
##
## $Year
## 2014 2015 2016 2017
## 110 107 105 112
##
## $Workforce.Status
## true
## 434
##
## $Total.Population
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 270 2645 4142 7692 13970 23749
##
## $Total.Population.MOE.Appx
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 403.4 1262.6 1580.2 1941.9 2901.6 3783.2
##
## $Record.Count
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 27.00 38.00 74.56 142.75 209.00
##
## $PUMS.Occupation
## Social workers
## 434
##
## $share
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0004017 0.0123359 0.0199890 0.0184332 0.0254779 0.0365852
head((Temp<-age_by_gender[,c(2,5)]))
## Age Total.Population
## 1 20 770
## 2 21 883
## 3 22 1138
## 4 23 2192
## 5 24 3200
## 6 25 3686
pairs.panels(Temp)
corr.test(Temp, method = "spearman")
## Call:corr.test(x = Temp, method = "spearman")
## Correlation matrix
## Age Total.Population
## Age 1.00 -0.39
## Total.Population -0.39 1.00
## Sample Size
## [1] 434
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## Age Total.Population
## Age 0 0
## Total.Population 0 0
##
## To see confidence intervals of the correlations, print with the short=FALSE option
summary(age_by_gender)
## ID.Gender Age Year Workforce.Status Total.Population
## 1:205 Min. :18.00 2014:110 true:434 Min. : 270
## 2:229 1st Qu.:33.00 2015:107 1st Qu.: 2645
## Median :47.00 2016:105 Median : 4142
## Mean :46.85 2017:112 Mean : 7692
## 3rd Qu.:60.00 3rd Qu.:13970
## Max. :81.00 Max. :23749
## Total.Population.MOE.Appx Record.Count PUMS.Occupation
## Min. : 403.4 Min. : 5.00 Social workers:434
## 1st Qu.:1262.6 1st Qu.: 27.00
## Median :1580.2 Median : 38.00
## Mean :1941.9 Mean : 74.56
## 3rd Qu.:2901.6 3rd Qu.:142.75
## Max. :3783.2 Max. :209.00
## share
## Min. :0.0004017
## 1st Qu.:0.0123359
## Median :0.0199890
## Mean :0.0184332
## 3rd Qu.:0.0254779
## Max. :0.0365852
age_by_gender[,c(2,5,6,7)]<-lapply(age_by_gender[,c(2,5,6,7)], as.integer)
str(age_by_gender)
## 'data.frame': 434 obs. of 9 variables:
## $ ID.Gender : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : int 20 21 22 23 24 25 26 27 28 29 ...
## $ Year : Factor w/ 4 levels "2014","2015",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Workforce.Status : Factor w/ 1 level "true": 1 1 1 1 1 1 1 1 1 1 ...
## $ Total.Population : int 770 883 1138 2192 3200 3686 4533 3449 4998 5244 ...
## $ Total.Population.MOE.Appx: int 681 729 828 1149 1388 1490 1652 1441 1735 1777 ...
## $ Record.Count : int 7 7 11 16 26 33 38 33 37 44 ...
## $ PUMS.Occupation : Factor w/ 1 level "Social workers": 1 1 1 1 1 1 1 1 1 1 ...
## $ share : num 0.00477 0.00547 0.00705 0.01357 0.01981 ...
#Visualizations
##Pie CHart of Gender
ggplot(age_by_gender, aes(x = "", fill = ID.Gender)) +
geom_bar(width = 1) +
labs(fill="ID.Gender", title="Pie Chart of Gender")+
coord_polar(theta = "y", start=0)
## Boxplot of Age
ggplot(data = age_by_gender, aes(y = Age)) +
geom_boxplot(varwidth=T, fill="plum") +
labs(title="Age Boxplot",x="Age",y="Count")+
scale_x_discrete() +
ylim(0,100)
##Correlation
corr_data<-age_by_gender[,c(2,5,6,7)]
corr_res<-cor(corr_data)
corrplot(corr_res,type="upper",order = "hclust",title = "Correlation between Pclass and Fare")
corr_res
## Age Total.Population Total.Population.MOE.Appx
## Age 1.0000000 -0.3318712 -0.3552414
## Total.Population -0.3318712 1.0000000 0.9851987
## Total.Population.MOE.Appx -0.3552414 0.9851987 1.0000000
## Record.Count -0.2558199 0.9876180 0.9802101
## Record.Count
## Age -0.2558199
## Total.Population 0.9876180
## Total.Population.MOE.Appx 0.9802101
## Record.Count 1.0000000
write.table(corr_res,file = 'corr_res.txt',sep = ",",quote = FALSE,row.names = F)
##Scatterplot of Population
ggplot(age_by_gender, aes(x = Total.Population, y = 1:nrow(age_by_gender))) +
geom_point()+
labs(y="Count",
x="Population",
title="Scatterplot of Population")
#Final Dataset
write.csv(age_by_gender,"cleaned_age_gender.csv", row.names = FALSE)