# 1.14 Complete Code

# Setting up working directory
# NOTE(review): hard-coded, machine-specific Windows path. The data file below
# is read with a relative name, so adjust this to the folder that holds it.
setwd("C:\\SAM\\R\\SA")
getwd()
# Loading data
# na.strings = c("") makes empty fields come in as NA, so blank cells are
# treated as missing values by is.na() / complete.cases() further down.
fin <- read.csv("Future-500.CSV", na.strings = c(""))

# Exploring data
head(fin, 20)
tail(fin, 10)
str(fin)
summary(fin)
# Changing from non-factor to factor
# ID and Inception are converted to factors; summary() and str() are re-run
# after each conversion to show how the column is now reported.
fin$ID <- factor(fin$ID)
summary(fin)
str(fin)
fin$Inception <- factor(fin$Inception)
summary(fin)
str(fin)

# Profit is also turned into a factor here. It is converted back to numeric
# later via as.numeric(as.character(...)); presumably this step only exists to
# demonstrate that round trip.
fin$Profit <- factor(fin$Profit)
str(fin)
# sub() and gsub()
# Strip the text decorations from the money/percentage columns so they can be
# converted to numbers. All patterns are literal strings, so fixed = TRUE is
# used instead of regular-expression escapes.
fin$Expenses <- gsub(" Dollars", "", fin$Expenses, fixed = TRUE)
head(fin)
fin$Expenses <- gsub(",", "", fin$Expenses, fixed = TRUE)
head(fin)
fin$Revenue <- gsub("$", "", fin$Revenue, fixed = TRUE)
head(fin)
fin$Revenue <- gsub(",", "", fin$Revenue, fixed = TRUE)
head(fin)
str(fin)
fin$Growth <- gsub("%", "", fin$Growth, fixed = TRUE)
str(fin)

# gsub() returns character vectors, so convert the cleaned columns to numeric.
fin$Expenses <- as.numeric(fin$Expenses)
fin$Revenue <- as.numeric(fin$Revenue)
fin$Growth <- as.numeric(fin$Growth)
# Profit is a factor at this point: as.numeric() on a factor yields the
# internal level codes, so go through as.character() to get the real values.
fin$Profit <- as.numeric(as.character(fin$Profit))
head(fin, 24)

# Locate rows that contain at least one missing value.
complete.cases(fin)
fin[!complete.cases(fin), ]
# Filtering using which() for non-missing data
head(fin)
# A logical index keeps NA entries (they show up as all-NA rows); which()
# returns only the positions that are TRUE, so those rows disappear.
fin[fin$Revenue == 8567910, ]
which(fin$Revenue == 8567910)
fin[which(fin$Revenue == 8567910), ]
fin[fin$Employees == 45, ]
fin[which(fin$Employees == 45), ]
head(fin, 24)

# Deliberate counter-example: comparing with NA via == always yields NA, so
# this does NOT select the rows with missing Expenses.
fin[fin$Expenses == NA, ]
# is.na() is the correct test for missing values.
a <- c(1, 24, 53, NA, 68, NA)
is.na(a)
fin[is.na(fin$Expenses), ]
# Removing rows with missing data
# Keep a copy of the data frame before rows are dropped.
fin_backup <- fin
fin[!complete.cases(fin), ]
fin[is.na(fin$Industry), ]
# Drop the rows whose Industry is missing.
fin <- fin[!is.na(fin$Industry), ]
fin
fin[!complete.cases(fin), ]

# Resetting the data frame index
# Two equivalent ways to renumber the rows after the removal above;
# seq_len() is used instead of 1:nrow() so an empty data frame is handled.
rownames(fin) <- seq_len(nrow(fin))
fin
rownames(fin) <- NULL
fin
# Replacing missing data with factual analysis
# A missing State is filled in from the City of the same row.
fin[!complete.cases(fin), ]
fin[is.na(fin$State), ]
fin[is.na(fin$State) & fin$City == "New York", ]
fin[is.na(fin$State) & fin$City == "New York", "State"] <- "NY"
# Checking it with the row number
# NOTE(review): row numbers are hard-coded for this particular data file.
fin[c(11, 377), ]
fin[!complete.cases(fin), ]

fin[is.na(fin$State) & fin$City == "San Francisco", ]
fin[is.na(fin$State) & fin$City == "San Francisco", "State"] <- "CA"
# Checking it with the row number
fin[c(82, 265), ]
fin[!complete.cases(fin), ]
# Replacing missing data: median imputation method
fin[!complete.cases(fin), ]
# Without na.rm = TRUE, median() returns NA if the column contains any NA.
median(fin[, "Employees"])
median(fin[, "Employees"], na.rm = TRUE)

# Missing Employees in Retail: use the median of the Retail rows only.
med_empl_retail <- median(
  fin[fin$Industry == "Retail", "Employees"],
  na.rm = TRUE
)
med_empl_retail
fin[is.na(fin$Employees) & fin$Industry == "Retail", "Employees"] <-
  med_empl_retail
# Check (row number is hard-coded for this data file)
fin[3, ]

# Dealing with missing data in Financial Services
# The industry name must be a single-line string; a line break inside the
# literal would never match any row.
med_empl_fs <- median(
  fin[fin$Industry == "Financial Services", "Employees"],
  na.rm = TRUE
)
med_empl_fs
fin[is.na(fin$Employees) & fin$Industry == "Financial Services",
    "Employees"] <- med_empl_fs
# Check
fin[330, ]

# Dealing with missing data in Growth
med_growth_construction <- median(
  fin[fin$Industry == "Construction", "Growth"],
  na.rm = TRUE
)
med_growth_construction
fin[is.na(fin$Growth) & fin$Industry == "Construction", "Growth"] <-
  med_growth_construction
# Check
fin[8, ]
# Revenue
# Missing Revenue in Construction: use the median of the Construction rows.
med_rev_constr <- median(
  fin[fin$Industry == "Construction", "Revenue"],
  na.rm = TRUE
)
med_rev_constr
fin[is.na(fin$Revenue) & fin$Industry == "Construction", "Revenue"] <-
  med_rev_constr
fin[!complete.cases(fin), ]

# Expenses
# Be careful here: the median is only imputed for Construction rows where
# Profit is missing as well. Rows that still have a Profit are left alone so
# their Expenses can be derived from Revenue and Profit instead.
med_exp_constr <- median(
  fin[fin$Industry == "Construction", "Expenses"],
  na.rm = TRUE
)
med_exp_constr
fin[is.na(fin$Expenses) & fin$Industry == "Construction" &
      is.na(fin$Profit), "Expenses"] <- med_exp_constr
fin[!complete.cases(fin), ]
# Replacing missing data: deriving values
# Revenue  = Expenses + Profit
# Expenses = Revenue - Profit
# Profit   = Revenue - Expenses

# Fill missing Profit from Revenue and Expenses of the same rows.
profit_missing <- is.na(fin$Profit)
fin[profit_missing, "Profit"] <-
  fin[profit_missing, "Revenue"] - fin[profit_missing, "Expenses"]
# Check (row numbers are hard-coded for this data file)
fin[c(8, 42), ]
fin[!complete.cases(fin), ]

# Fill missing Expenses from Revenue and Profit of the same rows.
expenses_missing <- is.na(fin$Expenses)
fin[expenses_missing, "Expenses"] <-
  fin[expenses_missing, "Revenue"] - fin[expenses_missing, "Profit"]
# Check
fin[15, ]
fin[!complete.cases(fin), ]
# Comments
# Post a Comment