R_Cleaning_Record

##install.packages("psych")
library(psych)
library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(corrplot)

## corrplot 0.90 loaded

#Read in the dataset
dataset_Name<-"Age by Gender.csv"
age_by_gender <- read.csv(dataset_Name, na.string=c("", " "))

##print
head(age_by_gender, n=5)

##   ID.Gender Gender ID.Age Age ID.Year Year ID.Workforce.Status Workforce.Status
## 1         1   Male     20  20    2017 2017                true             true
## 2         1   Male     21  21    2017 2017                true             true
## 3         1   Male     22  22    2017 2017                true             true
## 4         1   Male     23  23    2017 2017                true             true
## 5         1   Male     24  24    2017 2017                true             true
##   Total.Population Total.Population.MOE.Appx Record.Count PUMS.Occupation
## 1              770                  681.2696            7  Social workers
## 2              883                  729.5479            7  Social workers
## 3             1138                  828.2171           11  Social workers
## 4             2192                 1149.4546           16  Social workers
## 5             3200                 1388.8171           26  Social workers
##   ID.PUMS.Occupation Slug.PUMS.Occupation       share
## 1             211020       social-workers 0.004767654
## 2             211020       social-workers 0.005467323
## 3             211020       social-workers 0.007046221
## 4             211020       social-workers 0.013572335
## 5             211020       social-workers 0.019813628

## Have a look at the data types
str(age_by_gender)

## 'data.frame':    434 obs. of  15 variables:
##  $ ID.Gender                : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Gender                   : chr  "Male" "Male" "Male" "Male" ...
##  $ ID.Age                   : int  20 21 22 23 24 25 26 27 28 29 ...
##  $ Age                      : int  20 21 22 23 24 25 26 27 28 29 ...
##  $ ID.Year                  : int  2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 ...
##  $ Year                     : int  2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 ...
##  $ ID.Workforce.Status      : chr  "true" "true" "true" "true" ...
##  $ Workforce.Status         : chr  "true" "true" "true" "true" ...
##  $ Total.Population         : int  770 883 1138 2192 3200 3686 4533 3449 4998 5244 ...
##  $ Total.Population.MOE.Appx: num  681 730 828 1149 1389 ...
##  $ Record.Count             : int  7 7 11 16 26 33 38 33 37 44 ...
##  $ PUMS.Occupation          : chr  "Social workers" "Social workers" "Social workers" "Social workers" ...
##  $ ID.PUMS.Occupation       : int  211020 211020 211020 211020 211020 211020 211020 211020 211020 211020 ...
##  $ Slug.PUMS.Occupation     : chr  "social-workers" "social-workers" "social-workers" "social-workers" ...
##  $ share                    : num  0.00477 0.00547 0.00705 0.01357 0.01981 ...

## Get column names

(ColNames<-names(age_by_gender))

##  [1] "ID.Gender"                 "Gender"                   
##  [3] "ID.Age"                    "Age"                      
##  [5] "ID.Year"                   "Year"                     
##  [7] "ID.Workforce.Status"       "Workforce.Status"         
##  [9] "Total.Population"          "Total.Population.MOE.Appx"
## [11] "Record.Count"              "PUMS.Occupation"          
## [13] "ID.PUMS.Occupation"        "Slug.PUMS.Occupation"     
## [15] "share"

for(name in 1:length(ColNames)){
  cat(ColNames[name], "\n")
}

## ID.Gender 
## Gender 
## ID.Age 
## Age 
## ID.Year 
## Year 
## ID.Workforce.Status 
## Workforce.Status 
## Total.Population 
## Total.Population.MOE.Appx 
## Record.Count 
## PUMS.Occupation 
## ID.PUMS.Occupation 
## Slug.PUMS.Occupation 
## share

#Number of rows and columns before cleaning
(NumColumns <-ncol(age_by_gender))

## [1] 15

(NumRows <-nrow(age_by_gender))

## [1] 434

#Let's make tables of all the columns

#lapply(age_by_gender,table)  
lapply(age_by_gender,summary)

## $ID.Gender
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   1.528   2.000   2.000 
## 
## $Gender
##    Length     Class      Mode 
##       434 character character 
## 
## $ID.Age
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   33.00   47.00   46.85   60.00   81.00 
## 
## $Age
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   33.00   47.00   46.85   60.00   81.00 
## 
## $ID.Year
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2014    2014    2016    2016    2017    2017 
## 
## $Year
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2014    2014    2016    2016    2017    2017 
## 
## $ID.Workforce.Status
##    Length     Class      Mode 
##       434 character character 
## 
## $Workforce.Status
##    Length     Class      Mode 
##       434 character character 
## 
## $Total.Population
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     270    2645    4142    7692   13970   23749 
## 
## $Total.Population.MOE.Appx
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   403.4  1262.6  1580.2  1941.9  2901.6  3783.2 
## 
## $Record.Count
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   27.00   38.00   74.56  142.75  209.00 
## 
## $PUMS.Occupation
##    Length     Class      Mode 
##       434 character character 
## 
## $ID.PUMS.Occupation
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  211020  211020  211020  211020  211020  211020 
## 
## $Slug.PUMS.Occupation
##    Length     Class      Mode 
##       434 character character 
## 
## $share
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0004017 0.0123359 0.0199890 0.0184332 0.0254779 0.0365852

#Remove columns we do not need or want
head(age_by_gender  <- subset(age_by_gender, select=-c(Gender,ID.Age,ID.Year,ID.Workforce.Status,ID.PUMS.Occupation,Slug.PUMS.Occupation)))

##   ID.Gender Age Year Workforce.Status Total.Population
## 1         1  20 2017             true              770
## 2         1  21 2017             true              883
## 3         1  22 2017             true             1138
## 4         1  23 2017             true             2192
## 5         1  24 2017             true             3200
## 6         1  25 2017             true             3686
##   Total.Population.MOE.Appx Record.Count PUMS.Occupation       share
## 1                  681.2696            7  Social workers 0.004767654
## 2                  729.5479            7  Social workers 0.005467323
## 3                  828.2171           11  Social workers 0.007046221
## 4                 1149.4546           16  Social workers 0.013572335
## 5                 1388.8171           26  Social workers 0.019813628
## 6                 1490.5516           33  Social workers 0.022822823

str(age_by_gender)

## 'data.frame':    434 obs. of  9 variables:
##  $ ID.Gender                : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Age                      : int  20 21 22 23 24 25 26 27 28 29 ...
##  $ Year                     : int  2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 ...
##  $ Workforce.Status         : chr  "true" "true" "true" "true" ...
##  $ Total.Population         : int  770 883 1138 2192 3200 3686 4533 3449 4998 5244 ...
##  $ Total.Population.MOE.Appx: num  681 730 828 1149 1389 ...
##  $ Record.Count             : int  7 7 11 16 26 33 38 33 37 44 ...
##  $ PUMS.Occupation          : chr  "Social workers" "Social workers" "Social workers" "Social workers" ...
##  $ share                    : num  0.00477 0.00547 0.00705 0.01357 0.01981 ...

# Checking for missing values
# Check the entire DF for missing values in total
head(is.na(age_by_gender) )

##   ID.Gender   Age  Year Workforce.Status Total.Population
## 1     FALSE FALSE FALSE            FALSE            FALSE
## 2     FALSE FALSE FALSE            FALSE            FALSE
## 3     FALSE FALSE FALSE            FALSE            FALSE
## 4     FALSE FALSE FALSE            FALSE            FALSE
## 5     FALSE FALSE FALSE            FALSE            FALSE
## 6     FALSE FALSE FALSE            FALSE            FALSE
##   Total.Population.MOE.Appx Record.Count PUMS.Occupation share
## 1                     FALSE        FALSE           FALSE FALSE
## 2                     FALSE        FALSE           FALSE FALSE
## 3                     FALSE        FALSE           FALSE FALSE
## 4                     FALSE        FALSE           FALSE FALSE
## 5                     FALSE        FALSE           FALSE FALSE
## 6                     FALSE        FALSE           FALSE FALSE

#lapply(age_by_gender, is.na) 

## Using an inline function and sapply (for simplify apply)
#sapply(age_by_gender, function(x) sum(is.na(x)))

#Formatting
age_by_gender$ID.Gender <- factor(age_by_gender$ID.Gender)
age_by_gender$Year <- factor(age_by_gender$Year)
age_by_gender$PUMS.Occupation <- factor(age_by_gender$PUMS.Occupation)
age_by_gender$Workforce.Status <- factor(age_by_gender$Workforce.Status)
age_by_gender$Age <- as.numeric(age_by_gender$Age)
age_by_gender$Total.Population <- as.numeric(age_by_gender$Total.Population)
age_by_gender$Record.Count <- as.numeric(age_by_gender$Record.Count)

str(age_by_gender)

## 'data.frame':    434 obs. of  9 variables:
##  $ ID.Gender                : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Age                      : num  20 21 22 23 24 25 26 27 28 29 ...
##  $ Year                     : Factor w/ 4 levels "2014","2015",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Workforce.Status         : Factor w/ 1 level "true": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Total.Population         : num  770 883 1138 2192 3200 ...
##  $ Total.Population.MOE.Appx: num  681 730 828 1149 1389 ...
##  $ Record.Count             : num  7 7 11 16 26 33 38 33 37 44 ...
##  $ PUMS.Occupation          : Factor w/ 1 level "Social workers": 1 1 1 1 1 1 1 1 1 1 ...
##  $ share                    : num  0.00477 0.00547 0.00705 0.01357 0.01981 ...

table(age_by_gender$Age)

## 
## 18 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 
##  1  6  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8 
## 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 
##  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  7  7 
## 71 72 73 74 75 76 77 78 81 
##  6  5  6  4  3  1  1  2  1

## mean, variance 
summary(age_by_gender$Age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   33.00   47.00   46.85   60.00   81.00

nrow(age_by_gender)

## [1] 434

sd(age_by_gender$Age, na.rm = T)

## [1] 15.87455

pairs(age_by_gender[,c(2,5)],col = "blue")

lapply(age_by_gender,summary)

## $ID.Gender
##   1   2 
## 205 229 
## 
## $Age
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   33.00   47.00   46.85   60.00   81.00 
## 
## $Year
## 2014 2015 2016 2017 
##  110  107  105  112 
## 
## $Workforce.Status
## true 
##  434 
## 
## $Total.Population
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     270    2645    4142    7692   13970   23749 
## 
## $Total.Population.MOE.Appx
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   403.4  1262.6  1580.2  1941.9  2901.6  3783.2 
## 
## $Record.Count
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   27.00   38.00   74.56  142.75  209.00 
## 
## $PUMS.Occupation
## Social workers 
##            434 
## 
## $share
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0004017 0.0123359 0.0199890 0.0184332 0.0254779 0.0365852

head((Temp<-age_by_gender[,c(2,5)]))

##   Age Total.Population
## 1  20              770
## 2  21              883
## 3  22             1138
## 4  23             2192
## 5  24             3200
## 6  25             3686

pairs.panels(Temp)

corr.test(Temp, method = "spearman")

## Call:corr.test(x = Temp, method = "spearman")
## Correlation matrix 
##                    Age Total.Population
## Age               1.00            -0.39
## Total.Population -0.39             1.00
## Sample Size 
## [1] 434
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##                  Age Total.Population
## Age                0                0
## Total.Population   0                0
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option

summary(age_by_gender)

##  ID.Gender      Age          Year     Workforce.Status Total.Population
##  1:205     Min.   :18.00   2014:110   true:434         Min.   :  270   
##  2:229     1st Qu.:33.00   2015:107                    1st Qu.: 2645   
##            Median :47.00   2016:105                    Median : 4142   
##            Mean   :46.85   2017:112                    Mean   : 7692   
##            3rd Qu.:60.00                               3rd Qu.:13970   
##            Max.   :81.00                               Max.   :23749   
##  Total.Population.MOE.Appx  Record.Count          PUMS.Occupation
##  Min.   : 403.4            Min.   :  5.00   Social workers:434   
##  1st Qu.:1262.6            1st Qu.: 27.00                        
##  Median :1580.2            Median : 38.00                        
##  Mean   :1941.9            Mean   : 74.56                        
##  3rd Qu.:2901.6            3rd Qu.:142.75                        
##  Max.   :3783.2            Max.   :209.00                        
##      share          
##  Min.   :0.0004017  
##  1st Qu.:0.0123359  
##  Median :0.0199890  
##  Mean   :0.0184332  
##  3rd Qu.:0.0254779  
##  Max.   :0.0365852

age_by_gender[,c(2,5,6,7)]<-lapply(age_by_gender[,c(2,5,6,7)], as.integer)

str(age_by_gender)

## 'data.frame':    434 obs. of  9 variables:
##  $ ID.Gender                : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Age                      : int  20 21 22 23 24 25 26 27 28 29 ...
##  $ Year                     : Factor w/ 4 levels "2014","2015",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Workforce.Status         : Factor w/ 1 level "true": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Total.Population         : int  770 883 1138 2192 3200 3686 4533 3449 4998 5244 ...
##  $ Total.Population.MOE.Appx: int  681 729 828 1149 1388 1490 1652 1441 1735 1777 ...
##  $ Record.Count             : int  7 7 11 16 26 33 38 33 37 44 ...
##  $ PUMS.Occupation          : Factor w/ 1 level "Social workers": 1 1 1 1 1 1 1 1 1 1 ...
##  $ share                    : num  0.00477 0.00547 0.00705 0.01357 0.01981 ...

#Visualizations
##Pie CHart of Gender
ggplot(age_by_gender, aes(x = "", fill = ID.Gender)) + 
  geom_bar(width = 1) +
  labs(fill="ID.Gender", title="Pie Chart of Gender")+
  coord_polar(theta = "y", start=0)

## Boxplot of Age
ggplot(data = age_by_gender, aes(y = Age)) +
  geom_boxplot(varwidth=T, fill="plum") +
  labs(title="Age Boxplot",x="Age",y="Count")+
  scale_x_discrete() +
  ylim(0,100)

##Correlation
corr_data<-age_by_gender[,c(2,5,6,7)]
corr_res<-cor(corr_data)
corrplot(corr_res,type="upper",order = "hclust",title = "Correlation between Pclass and Fare")

corr_res

##                                  Age Total.Population Total.Population.MOE.Appx
## Age                        1.0000000       -0.3318712                -0.3552414
## Total.Population          -0.3318712        1.0000000                 0.9851987
## Total.Population.MOE.Appx -0.3552414        0.9851987                 1.0000000
## Record.Count              -0.2558199        0.9876180                 0.9802101
##                           Record.Count
## Age                         -0.2558199
## Total.Population             0.9876180
## Total.Population.MOE.Appx    0.9802101
## Record.Count                 1.0000000

write.table(corr_res,file = 'corr_res.txt',sep = ",",quote = FALSE,row.names = F)

##Scatterplot of Population
ggplot(age_by_gender, aes(x = Total.Population, y = 1:nrow(age_by_gender))) + 
  geom_point()+
  labs(y="Count", 
       x="Population", 
       title="Scatterplot of Population")

#Final Dataset
write.csv(age_by_gender,"cleaned_age_gender.csv", row.names = FALSE)

R_Cleaning_Record

Sonali Pednekar

29/09/2021