'Study/R' 카테고리의 글 목록

Study/R

R-10 2020.09.05
R-9 2020.09.05
R-8 2020.09.05
R-7 2020.09.05
R-6 2020.09.05
R-5 2020.09.05
R-4 2020.09.05
R-3 2020.09.05

R-10

2020. 9. 5. 14:36

728x90

p <- c(3,5,6,8)
q <- c(3,3,3)
p+q#
#recycling
#[1]  6  8  9 11
#경고메시지(들):
#  In p + q : 두 객체의 길이가 서로 배수관계에 있지 않습니다
p <- c(3,5,6,8)
q <- c(2,3,4)
p+q#
#recycling
#[1]  6  8  9 11
#경고메시지(들):
#  In p + q : 두 객체의 길이가 서로 배수관계에 있지 않습니다
#마지막것은 처음것으로 더한다.
#고수준 : 혼자 그릴수 있다.
#저수준 :그림이 있어야 그릴수 있다.

plot(airquality)#그림을 박스별로 그리는 것

#상자수염은 이상치 찾을때

library(readxl)
library(dplyr)
BostomHousing <- read_excel("BostonHousing.xls")
str(BostomHousing)
data <- tbl_df(BostomHousing)#데이터의 형태를 바꾼다. 이렇게 바꾸면 큰 데이터 프레임을 보기 쉽도록 해 준다.
#tbl_df -> as_tibble를 불러온다.
str(data)

View(data)
glimpse(data)

data(package ="MASS")
names(data)


library(psych)
pairs.panels(data[names(data)])#데이터 있는 것들을 이름을 가져와서 
#상관간계 보여 주는 것 
#0.70 과 -0.74
#rm lstat와 관계있다.
pairs.panels(data)
#plot에서 상관관계 
data <- data[ , -15]#오류난다.
data

names(data)
names(data) <- tolower(names(data))#소문자로 바꿔준다.
data_lm <- lm(medv ~ ., data = data)
#medv가격을 나타내는 주기 
#.한꺼번에 다 너주기 
data_lm
#이것은 작을 수록 좋다.

#excel cliboard에서 읽는 방법
x <- readClipboard()
read.table(file="clipboard", sep ="\t" , header = T)
read.clipboard()
#-----------------------------

#의사결정나무는 기초
data <- read.csv("churn.csv")

#결측치 있는 지 확인 
#sapply(data, function(x) sum(is.na(data) ))

str(data)
library(caret)
set.seed(1234)
#names(data) <- tolower(names(data))
#names(data)
names(data)[21] <- "churn"
index <- createDataPartition(y = data$churn, p = 0.7, list = F)
train <- data[index,]
test <- data[-index,]
3333*0.7
str(train)#2334

library(rpart)
library(rpart.plot)

rpart_tree <- rpart(churn ~. , train[,c(-1,-4)])# 첫번째 열을 stat -1빼고 -4 phone 열을 빼고 나무 만들었다.
rpart_tree <- rpart(churn ~. , train[,c(-1,-4 )] ,
                    control= rpart.control(minsplit =  10, 
                                           minbucket = 3, 
                                           cp = 0.03, 
                                           maxdepth = 10))
plot(rpart_tree)
text(rpart_tree, cex = 1.5)#나무 만들기
rpart.plot(rpart_tree, cex =1 )

#예측
t <- test[2, ]#필요한 정보만 주면 되기때문에 안맞아도 문제 없다.
predict(rpart_tree , newdata = t )#t라는 분이 해지 할지 안할지 
rpart_tree
printcp(rpart_tree)
plotcp(rpart_tree)

install.packages("rattle")
library(rattle)

fancyRpartPlot(rpart_tree)
fancyRpartPlot(rpart_tree, cex = 0.6)
rpart.plot(rpart_tree, cex = 1)

rpart_pred <- predict(rpart_tree, test[,c(-1,-4)], type="class")
library(caret)
confusionMatrix(rpart_pred, test$churn, positive ="True.")

rpart_tree$cptable
rpart.prune <- prune(rpart_tree, cp = 0.05, "CP")#사후적인 가지치기 
rpart.plot(rpart.prune)
rpart.plot(rpart.prune, cex = 0.7)

rpart_pred2 <- predict(rpart.prune,test[,c(-1,-4)], type="class")
confusionMatrix(rpart_pred2, test$churn, positive ="True.")

#accuracy높다고 좋은 것이 아니다. 
install.packages("party")
library(party)
party_tree <- ctree(churn ~ ., train[,c(-1,-4)])
plot(party_tree)

party_pred <- predict(party_tree, test[,c(-1,-4)],type="response")
confusionMatrix(party_pred,test$churn, positive = "True.")

confusionMatrix(rpart_pred2, test$churn, positive = "True.")$overall
confusionMatrix(party_pred, test$churn, positive = "True.")$overall
#---------------
#---------------
#연관성 규칙 
install.packages("arules")
install.packages("arulesViz")
install.packages("wordcloud")#wordcloud2한다.


library(arules)
library(arulesViz)
library(wordcloud)

data("Groceries")
summary(Groceries)
str(Groceries)
class(Groceries)
#"transactions"
#transactions 은 다룰 수 없다.그래서 as.data.frame으로 

#데이터 프레임으로 변환
groceries_df <- as(Groceries,"data.frame")#as class를 class이름 이 들어간다.
#data.frame
class(groceries_df)

itemName <- itemLabels(Groceries)
itemCount <- itemFrequency(Groceries)*9835

#워드 클라우드 
groceries_df
col <- brewer.pal(8, "Dark2") 
#x11()
wordcloud(words = itemName , ffreq = itemCount,min.freq = 1,
          scale = c(3, 0.2) , col = col, random.order = F)

#apriori 함수로 규칙 생성 
rules <- apriori(Groceries, parameter = list(support=0.01 , confidence = 0.35))

#생성된 규칙 검사
summary(rules)
inspect(rules)
inspect(rules, by ="lift")

getwd()
write.csv(as(rules, "data.frame"),"Geoceries_rules")
x11()

#규칙을  plot으로 표현하기 
plot(rules, method="scatterplot")
plotly_arules(rules, method ="scatterplot",
              measure = c("support","confidence"),shading = "lift")
plotly_arules(rules, method = "matrix",
              measure = c("support","confidence"),shading = "lift")
#연관 규칙 요구르트를 사면 야채를 산다 등 연관관계를 찾아내는 것 
#연관규칙 분석 
#산것은 1안산것은  0이다.

#지지도 전체에서 
#신뢰도 빵을 샀을때 우유가 등장한다. 지지도가 1이면 세트일 가능성이 있다. 
#향상도 빵을 사면 우유를 산다. 우유는 무조건 산다.지지도가 그부분을 설명 할 수 없다.

'Study > R' 카테고리의 다른 글

R-9 (0)	2020.09.05
R-8 (0)	2020.09.05
R-7 (0)	2020.09.05
R-6 (0)	2020.09.05
R-5 (0)	2020.09.05

R-9

2020. 9. 5. 14:29

728x90

library(ggplot2)
library(dplyr)
library(RColorBrewer)
library(ggthemes)
install.packages("ggrepel")
library(ggrepel)
install.packages("chron")
library(chron)
library(lubridate)
install.packages("xts")
library(xts)
install.packages("highcharter")
library(highcharter)

data <- read.csv("ctrucks_clean.csv",stringsAsFactors = F)
head(data)
str(data)
summary(data)

data$origin = as.factor(data$origin)#출발지 
data$supplier = as.factor(data$supplier)
data$Date = ymd(data$Date)#ymd형태로"2017-04-10"
data$month = month(data$month, label = T)
data$week_day = weekdays(data$Date, abbreviate = T)#요일 표현한다.
#열이 있으면 보여주고 없으면 만들어준다.
#MUTATE

#지난 3년간 가장 많은 수소을 담당한
str(data)
#트력 아이디 별로 빈도소 찾아서 내림차순별로
#1.
truck <- data %>% group_by(truckid) %>% summarise(freq= n()) %>% top_n(20)

library(ggplot2)
library(dplyr)
library(RColorBrewer)
library(ggthemes)
install.packages("ggrepel")
library(ggrepel)
install.packages("chron")
library(chron)
library(lubridate)
install.packages("xts")
library(xts)
install.packages("highcharter")
library(highcharter)

data <- read.csv("ctrucks_clean.csv",stringsAsFactors = F)
head(data)
str(data)
summary(data)

data$origin = as.factor(data$origin)#출발지 
data$supplier = as.factor(data$supplier)
data$Date = ymd(data$Date)#ymd형태로"2017-04-10"
data$month = month(data$month, label = T)
data$week_day = weekdays(data$Date, abbreviate = T)#요일 표현한다.
#열이 있으면 보여주고 없으면 만들어준다.
#MUTATE

#지난 3년간 가장 많은 수소을 담당한
str(data)
#트력 아이디 별로 빈도소 찾아서 내림차순별로
#1.
truck <- data %>% group_by(truckid) %>% summarise(freq= n()) %>% top_n(20)
truck
data %>% filter(truckid =='1073FL01')
data %>% filter(truckid =='7922GR01')
#2.
truck1 <- data %>% group_by(truckid) %>% summarise(freq = n()) %>% arrange(desc(freq)) %>% head(20)
truck1

truck3 <- data %>%
  group_by(truckid) %>%
  summarize(freq = n()) %>%   
  arrange(desc(freq)) %>%  head(20) %>% as.data.frame()
truck3

#reorder 재정렬 하는 것이다.
ggplot(data = truck, aes(reorder(truckid, freq), y= freq, fill= freq))+
  geom_bar(stat ="identity",position="dodge")+xlab("Truck")+ylab("Frequency")+coord_flip()

#15를 25로 바꿀 수 있다.
par(mfrow= c(1,2))
ggplot(data = truck, aes(reorder(truckid, freq), y= freq, fill= freq))+
  geom_bar(stat ="identity",position="dodge")+xlab("Truck")+ylab("Frequency")+
 scale_fill_gradientn(name ="",
                    colours =rev(brewer.pal(10, 'Spectral')))+
  geom_text(aes(label = freq ), hjust= 0.5, size = 5.5)+
  ggtitle("The top 20 Trucks with the most deliveries")+
  theme( legend.position = 'none',
         axis.title.y = element_text(size = 15),
         axis.text.y = element_text(size = 15),
         axis.title.x = element_text(size = 15),
         axis.text.x= element_text(size = 15),
         plot.title = element_text(size = 15, hjust = 0.2))+coord_flip()
#scale_fill_gradientn
#coord_flip
#brewer.pal

ggplot(data = truck, aes(reorder(truckid, freq), y= freq, fill= freq))+
  geom_bar(stat ="identity",position="dodge")+xlab("Truck")+ylab("Frequency")+
  scale_fill_gradientn(name ="",
                       colours =rev(brewer.pal(10, 'Set3')))+
  geom_text(aes(label = freq ), hjust= 0.5, size = 5.5)+#옆에 있는 빈도수 바꾸기 
  ggtitle("The top 20 Trucks with the most deliveries")+
  theme( legend.position = 'none',
         axis.title.y = element_text(size = 25),
         axis.text.y = element_text(size = 25),
         axis.title.x = element_text(size = 25),
         axis.text.x= element_text(size = 25),
         plot.title = element_text(size = 25, hjust = 0.2))+coord_flip()
#legend.position d없을 경우 legend 없다.
display.brewer.all()
#ggplot도 함수이다.

data %>%
  group_by(truckid) %>%
  summarize(freq = n()) %>%   
  arrange(desc(freq)) %>%  head(20) %>% as.data.frame()+
ggplot(data,aes(reorder(truckid, freq), y= freq, fill= freq))+
  geom_bar(stat ="identity",position="dodge")+xlab("Truck")+ylab("Frequency")+
  scale_fill_gradientn(name ="",
                       colours =rev(brewer.pal(10, 'Set3')))+
  geom_text(aes(label = freq ), hjust= 0.5, size = 5.5)+#옆에 있는 빈도수 바꾸기 
  ggtitle("The top 20 Trucks with the most deliveries")+
  theme( legend.position = 'none',
         axis.title.y = element_text(size = 25),
         axis.text.y = element_text(size = 25),
         axis.title.x = element_text(size = 25),
         axis.text.x= element_text(size = 25),
         plot.title = element_text(size = 25, hjust = 0.2))+coord_flip()

data %>%
  group_by(truckid) %>%
  summarize(freq = n()) %>%   
  arrange(desc(freq)) %>%  head(20) %>% as.data.frame() %>% 
  ggplot(aes(reorder(truckid, freq), y= freq, fill= freq))+
  geom_bar(stat ="identity",position="dodge")+xlab("Truck")+ylab("Frequency")+
  scale_fill_gradientn(name ="",
                       colours =rev(brewer.pal(10, 'Set3')))+
  geom_text(aes(label = freq ), hjust= 0.5, size = 5.5)+#옆에 있는 빈도수 바꾸기 
  ggtitle("The top 20 Trucks with the most deliveries")+
  theme( legend.position = 'none',
         axis.title.y = element_text(size = 25),
         axis.text.y = element_text(size = 25),
         axis.title.x = element_text(size = 25),
         axis.text.x= element_text(size = 25),
         plot.title = element_text(size = 25, hjust = 0.2))+coord_flip() 



data %>% group_by(week_day,month) %>% count()

data %>% group_by(week_day,month) %>% count() %>% 
  ggplot(aes(reorder(week_day, n ), y = n , fill = n ))+
  geom_bar(stat ="identity",position ="dodge")+
  xlab("Day")+
  ylab("Delivery Count")+
  scale_fill_gradientn(name ="",
                       colours =rev(brewer.pal(10, 'Spectral')))+
  geom_text(aes(label = n ), hjust=- 0.3, size = 3.5)+#옆에 있는 빈도수 바꾸기 
  ggtitle("Delivery Counts through the Days by Month")+
  facet_wrap( ~ month)+
  theme( legend.position = 'none',
         axis.title.y = element_text(size = 15),
         axis.text.y = element_text(size = 10),
         axis.title.x = element_text(size = 15),
         axis.text.x= element_text(size = 10),
         plot.title = element_text(size = 25, hjust = 0.2),
         strip.text = element_text(size = 8, face ="bold"),
         strip.background = element_rect(colour = "red",fill="#CCCCFF")) 


data %>% group_by(week_day,month) %>% count() %>% 
  ggplot(aes(reorder(week_day, n ), y = n , fill = n ))+
  geom_bar(stat ="identity",position ="dodge")+
  xlab("Day")+
  ylab("Delivery Count")+
  scale_fill_gradientn(name ="",
                       colours =rev(brewer.pal(10, 'Spectral')))+
  geom_text(aes(label = n ), hjust=- 0.3, size = 3.5)+#옆에 있는 빈도수 바꾸기 
  ggtitle("Delivery Counts through the Days by Month")+
  facet_wrap( ~ month)+
  theme( legend.position = 'none',
         axis.title.y = element_text(size = 15),
         axis.text.y = element_text(size = 10),
         axis.title.x = element_text(size = 15),
         axis.text.x= element_text(size = 10),
         plot.title = element_text(size = 25, hjust = 0.2),
         strip.text = element_text(size = 15, face ="bold"),
         strip.background = element_rect(colour = "red",fill="#CCCCFF")) +coord_flip() 
#coord_flip() 옆으로 펼쳐진다.

data$month

data %>%filter(month != 2 & month != 6 & month != 7)%>% group_by(week_day,month)%>% count() %>% 
  ggplot(aes(reorder(week_day, n ), y = n , fill = n ))+
  geom_bar(stat ="identity",position ="dodge")+
  xlab("Day")+
  ylab("Delivery Count")+
  scale_fill_gradientn(name ="",
                       colours =rev(brewer.pal(10, 'Spectral')))+
  geom_text(aes(label = n ), hjust=- 0.3, size = 3.5)+#옆에 있는 빈도수 바꾸기 
  ggtitle("Delivery Counts through the Days by Month")+
  facet_wrap( ~ month)+
  theme( legend.position = 'none',
         axis.title.y = element_text(size = 15),
         axis.text.y = element_text(size = 10),
         axis.title.x = element_text(size = 15),
         axis.text.x= element_text(size = 10),
         plot.title = element_text(size = 25, hjust = 0.2),
         strip.text = element_text(size = 8, face ="bold"),
         strip.background = element_rect(colour = "red",fill="#CCCCFF")) 
# facet_wrap( ~ month)옆으로 펼치기
#month ==3 | month == 4 | month == 5
data %>%filter(! month %in% c(2,6,7))%>% group_by(week_day,month)%>% count() %>% 
  ggplot(aes(reorder(week_day, n ), y = n , fill = n ))+
  geom_bar(stat ="identity",position ="dodge")+
  xlab("Day")+
  ylab("Delivery Count")+
  scale_fill_gradientn(name ="",
                       colours =rev(brewer.pal(10, 'Spectral')))+
  geom_text(aes(label = n ), hjust=- 0.3, size = 3.5)+#옆에 있는 빈도수 바꾸기 
  ggtitle("Delivery Counts through the Days by Month")+
  facet_wrap( ~ month)+
  theme( legend.position = 'none',
         axis.title.y = element_text(size = 15),
         axis.text.y = element_text(size = 10),
         axis.title.x = element_text(size = 15),
         axis.text.x= element_text(size = 10),
         plot.title = element_text(size = 25, hjust = 0.2),
         strip.text = element_text(size = 8, face ="bold"),
         strip.background = element_rect(colour = "red",fill="#CCCCFF")) 
#색상을 fill로 채워야 한다.

data$Date

by_date_2015 <- data %>% filter(Date <= ('2016-01-01')) %>% 
  group_by(Date) %>% summarise(Total = n())
by_date_2015

time_series = xts(by_date_2015$Total, order.by = by_date_2015$Date)
hchart(time_series, name ="Number of Deliberies") %>% 
  hc_title(text = "Total Deliveries for 2015") %>% 
  hc_legend(enabled = T)

by_date_2016 = na.omit(data) %>% filter(Date >= ('2016-01-01') & Date <('2017-01-01')) %>% 
  group_by(Date) %>% 
  summarise(Total = n())

time_series = xts(by_date_2016$Total, order.by = by_date_2016$Date)

hchart(time_series, name ="Number of Deliveries") %>% 
  hc_add_theme(hc_theme_darkunica()) %>% 
  hc_title(text ="Total Deliveries for 2016") %>% 
  hc_legend(enaled = T)

churn데이터 고객이탈 데이터
어떤 분이 해지 하는지 알고 싶을때
#상관관계 확인 126페이지
#상관관계가 1이 나오면 낮동안 통화한 데이터 ,낮시간 금액
#예시: ‚cars‛ 데이터(내장 데이터)
#• 상관관계 확인: 회귀분석은 기본적으로 상관관계에 대핚 통계적 분석
#• 인과관계를 주장하려면 추가적으로 통계 외적인 귺거가 필요 (예: 해당 분야의 이롞적 귺거, 실험의 경우 변인통제, …)

Coefficient 관계
기무가설이 기각이 되면 상관관계가 없지 않아요
speed 변수는 값이 매우 작으므로 유의하다.

cars
data1 <- cars#151
plot(data1)
library(ggplot2)
ggplot(data = data1, aes(x= speed, y= dist ))+geom_point()#153

cor(data1)#상관행렬 
cor.test(data1$speed,data1$dist)

install.packages("corrplot")
library(corrplot)
#dataframe의 cor값이 들어간다.cor(data1)
corrplot::corrplot.mixed(cor(data1))

data2 <- airquality
corrplot::corrplot.mixed(cor(data2))

data3 <- mpg
corrplot::corrplot.mixed(cor(data3))#Error in cor(data3) : 'x' must be numeric
corrplot::corrplot.mixed(cor(mtcars))

fit.cars <- lm(dist ~ speed , data = cars)
fit.cars

summary(fit.cars)
#별표가 붙어 있다. signif 무의미한 
#0에 가까운 것을 잘 띄울수록 별표 표시해주고 
#별새계가 중요하다

잔차분석
상관관계 1이면 선에 붙어있다.
잔차 원래 데이터 문제를 잔차가지고 정규분포한다.
잔차들이 일정한 모습을 가져야 하면 안된다.
잔차는 아무것이나 random하게 해야 한다.

new <- data.frame(speed = c(122))
predict(fit.cars, newdata = new ,interval = "confidence")
predict(fit.cars, newdata = new ,interval = "confidence",level= 0.9)
predict(fit.cars, newdata = new ,interval = "prediction")
#이선자체가 맞다는 보장성이 없어서 prediction 
# confidence 범위보다 넓다.

fit <- lm( dist ~ speed, data = cars)
fit
plot(fit)

par(mfrow= c(2,2))#multi figure
plot(fit)
#fitted 잔차는 0이다. random하게 영향성이 없이 잘 분포되여있다. 빨간선
#빨간선이 포물선처럼 되면 별로 좋지 않다.
#q-q plot qiantiles 45도 회귀의 기울기로 점들이 아래위로 하면서 따라간다.숫자는 이상치가 있다 윗부분이 잘 안 딸아간다.
#scale-Location scale표준화 random하게 있어줘야 한다.
#Residuals vs Leverage 잔차 0.5 안쪽으로 벗어나지 말아야 한다. 빨간색 안쪽으로 그 사이에 있으면 된다.


#Fitted values
#1번 그림 가지고 하면 된다.
plot(fit.cars, 1)
#잒차도를 확인해보니 적합값 (Fitted values)이 커질수록 잒차가 넓게 퍼지는 경향이 보인다. 
install.packages("lmtest")
library(lmtest)
install.packages("car")
library(car)
#잔차분석 – 등분산성 검정 
#규칙없이 흩어져야 한다.
bptest(fit.cars)
#잔차분석을 할 때 잔차도를 보고 판단하는 것 뿐만 아니라 잒차에 대핚 검정 또핚 가능하다.  
#유의확률(p-value)이 작으면 등분산성 가정에 문제가 있을 수 있다 (등분산성 가정을 의심해봐야 핚다). 
ncvTest(fit.cars)#car library
#p-value = 0.07297->기무가설을 체택하는 쪽으로 선택한다.
#p = 0.031049
#p값이 작으면 문제 있따.기무가설은 등분산성 가정에 문제가 있을 수 있다.
# 0.03 <0.05보다 작으니깐 조심해야 한다.

잔차분석을 통해 회귀분석의 가정에 문제가 있는 것을 확인했다면 , 그다음에는 어떻게 해야 하는가 ?-->회귀분석 다시 해야 한다.
데이터 변환을 해야 한다.->171페이제
2차곡선이 그려지게 점들이 있다. 2차의 곡선을 찾아줘야 한다.
y는 dist의 제곱하고 root를 씌워져서 값이 줄어든다. root로 해서 새로운 열을 만들어준다.

#잔차분석 -정규성 검정 
#데이터로 보여주는  정규성
plot(cars,2)
shapiro.test(fit.cars$res)#169
#0.02152 
#유의확률(p-value)이 작으면 정규성 가정에 문제가 있을 수 있다 
#(정규성 가정을 의심해봐야 핚다). 
#잔차를 확인 해야 한다. 잔차는 자주 사용해야한다.
#작은 것이 아니락 크야 한다.

data2 <- cars
#덮어씌우는 것
#data2$dist <- sqrt(data2$dist)
#colnames(data2)[2] <- "sqrt.dist"
#새로운 열을 만드는 것
data2$sqrt.dist <- sqrt(data2$dist)
head(data2, 5)

fit.cars2 <- lm(sqrt.dist ~speed, data = data2)
fit.cars2
bptest(fit.cars2)
#0.9157
ncvTest(fit.cars2)
#0.91258
#등분산성 만족 
plot(fit.cars2)#처음 보다 덜 위험하다.
shapiro.test(fit.cars2$residuals)
#p-value = 0.3143
summary(fit.cars2)


ggplot(data = data2, aes(x= speed, y = sqrt.dist))+
  geom_point()+
  geom_abline(data = date2, intercept = 1.27705, slope = 0.32241, col= "red")

다중선형회귀분석
변수가 2개 이상일 경우
종속 변수
성적
영향을 주는 것 을 모른다.

변수를 조작한다.
변수가 여러개있다.
최소제곱법-
각각의 개수들을 찾아보는 것
예: 부동산 예측

#다중
library(readxl)
BostomHousing <- read_excel("BostonHousing.xls")
str(BostomHousing)
data <- tbl_df(BostomHousing)#데이터의 형태를 바꾼다. 이렇게 바꾸면 큰 데이터 프레임을 보기 쉽도록 해 준다.
#tbl_df -> as_tibble를 불러온다.
str(data)

View(data)
glimpse(data)

data(package ="MASS")
names(data)

library(psych)
pairs.panels(data[names(data)])#데이터 있는 것들을 이름을 가져와서 
#상관간계 보여 주는 것 
#0.70 과 -0.74
#rm lstat와 관계있다.
pairs.panels(data)
#plot에서 상관관계 
data <- data[ , -15]#오류난다.
data

names(data)
names(data) <- tolower(names(data))#소문자로 바꿔준다.
data_lm_full <- lm(medv ~ ., data = data)
#medv가격을 나타내는 주기 
data_lm_full
#이것은 작을 수록 좋다.

summary(data_lm_full)
#signif 별표시 되여있다. 별 3개 위주로 값이 작으면 좋다 . 별이 없으면 무의미한 것이여서
#R-squard는 r제곱값인데ㅔ 크면 좋다. 0.74이면 74%정도 말하면 된다.
#Adjusted R-squard 변수가 많으면 모형을 잘 설명해줄것이라고 생각한 것처럼 변수가 늘어나면 아무 의미가 없어도 변수가 널아나서 변수가 조금 변한다. 변수가 저절로 높으지지 않도록 마이나서ㅓ(penalty)를 줘서 값이 작아진다.
#Adjusted R-squard 이것 확인하면 된다.
#전체 적인 pvalue는 2e-16이다 믿을만 하다.

-----------------------------------------------------
의사결정나무 정리
redpineK 머신러닝 시리즈
의사결정 순도를 높이는 방향으로 하는 것이다.
넘 잘도면 overfitting발생 가능하다.
entropy낮추는 방향으로 -<순도를 높이는 방향으로 한다.
의사결정 시리즈로
분류하고 예측한다.
random forest
boosting

신경망에서 overfitting의 해결방법은 빼는 것이다.droupout
적당한 선에서 멈춘다.

complexity parameter
hypper parameter

의사결정나무의 장점
회이트 박스모형이며 ,결과를 해석하고 이해하기 쉽다.
자료를 가공할 필요가 거의 없다.
수치 자료와 범주 자료 모두에 적용할 수 있다.

#의사결정 나무 
#ADSP
library(rpart)

apple_df <- read.csv(file= "apple.csv")
apple_df
summary(apple_df)

str(apple_df)

#품종별 무게 
boxplot(weight ~ model , data = apple_df, ylab="무게" )

#품종별 당도
boxplot(sugar ~ model, data = apple_df, ylab ='당도')

#품종별 산도
boxplot(acid ~ model, data = apple_df, ylab ='산도')
ggplot(apple_df, aes(x = model, y = acid)) +geom_boxplot()

#최초 실행시 패키
library("ggplot2")

k <- ggplot(apple_df, aes(factor(color), fill= factor(model)))
k + geom_bar()
#geom_bar  geom_col차이점 
k1 <- ggplot(apple_df, aes(factor(color), fill= factor(model)))
k1 +  geom_col()#geom_col requires the following missing aesthetics: y

#옆으로 하는 것 position="dodge"

#iris데이터 구조
# 꽃 종류 
#꽃받침 S
#꽃잎 P
str(iris)
colnames(iris)
nrow(iris)
table(iris$Species)

library(caret)

iris_row_idx <- createDataPartition(iris$Species, p = 0.8 , list = F)#list형이 아닌 vector형으로 
#iris$Species 를 1-100 중에  80개 random하게 꺼내기 
str(iris_row_idx)
View(iris_row_idx)

iris_train_data <- iris[iris_row_idx,]
str(iris_train_data)
table(iris_train_data$Species)
iris_test_Data <- iris[-iris_row_idx,]
str(iris_test_Data)
table(iris_test_Data$Species)#3개씩 층 30건

#traing 와 테스트 각각의 종류별로도 나누었다.

summary(iris_train_data)
summary(iris_test_Data)

library(rpart)

iris_rpart_result <- rpart(Species ~ . , data = iris_train_data, control = rpart.control(minsplit =2 ))
#Species ~ . species 와 나머지 
#data = iris_train_Data 120개 traing까지 
#rpart.control(minsplit =2 )나머지 2개가 있을 때 까지

#분류분석 결과값 출력 
iris_rpart_result
#질문이 

library(rpart.plot)
rpart.plot(iris_rpart_result)

#cp값 조회
iris_rpart_result$cptable
#단계로 나눈다.

#가지치기 
iris_prune_tree <- prune(iris_rpart_result, cp= 0.0125)
#iris_rpart_result$cptable cp값으로 수정 
#cp 잴 아래것 빼고 빼는 것이 좋다.
rpart.plot(iris_prune_tree)
#마지막을 치운다. 적당하게 일반화 해줘야 한다.

#rm(list = ls())

str(iris_test_Data)
#테스트 데이터 확인 - 훈련 데이터와 컬럼명이 같아야 함 (단 종속 변수 걸럼은 없어)
predict(iris_rpart_result ,iris_test_Data, type = "class")

actual <- iris_test_Data$Species
expect <- predict(iris_rpart_result ,iris_test_Data, type = "class")
expect_prune <- predict(iris_prune_tree ,iris_test_Data, type = "class")#가지치기 하는 것도 예측해야 한다.

iris_predict_df <- data.frame(actual, expect, expect_prune)
iris_predict_df#결과값 확인 

#혼동행렬 만들기 
table(iris_predict_df)

library(caret)
confusionMatrix(expect,actual, mode ="everything" )

#------------------------

library(ggplot2movies)
movies <- ggplot2movies::movies
str(movies)

apple_train_idx <- createDataPartition(apple_df$model , p = 0.8, list = F)
nrow(apple_train_idx)
nrow(apple_df)

str(apple_df)

apple_train_df <- apple_df[apple_train_idx,]
apple_test_df <- apple_df[-apple_train_idx,]

str(apple_train_df)
str(apple_test_df)
apple_train_df
apple_test_df
table(apple_train_df$model)
table(apple_test_df$model)

apple_rpart_result <- rpart(model ~ . , data = apple_train_df, control = rpart.control(minsplit = 2))
apple_rpart_result

rpart.plot(apple_rpart_result)
str(apple_rpart_result)

apple_rpart_result$cptable
apple_pune_tree <- prune(apple_rpart_result , cp = 0.0000)
rpart.plot(apple_pune_tree)
str(apple_test_df)

app_actual <- apple_test_df$model
app_expect <- predict(apple_rpart_result, apple_test_df, type ="class")

app_predict_df <- data.frame(app_actual, app_expect)

table(app_predict_df)

#randomForest
install.packages("randomForest")
require(randomForest)
ind <- sample(2, nrow(iris), replace = T, prob = c(0.7 , 0.3))
trainData <- iris[ind == 1,]
testData <- iris[ind == 2,]

rf <- randomForest(as.factor(Species)~ . , data = trainData, ntree = 100, proximity = T, importance = T)
print(rf)

predict(rf)
table(predict(rf), trainData$Species)
irisPred <- predict(rf, newdata = testData)
irisPred
table(irisPred, testData$Species)

랜덤 포레스트(영어: random forest)는 분류, 회귀 분석 등에 사용되는 앙상블 학습 방법의 일종으로, 훈련 과정에서
구성한 다수의 결정 트리로부터 부류(분류) 또는 평균 예측치(회귀 분석)를 출력함으로써 동작한다.
Bagging은 샘플을 여러 번 뽑아 각 모델을 학습시켜 결과를 집계(Aggregating) 하는 방법입니다.
랜덤 프레스트는 배깅이 발전한 것이다.
부스팅
배깅이 일반적인 모형을 만드는데 집중되어 있다면 ,부스팅은 맞추기 어려운 문제를 맞추는 것이 목적이다.

#boosting
#파산 분류
credit <- read.csv("credit.csv")
View(credit)
install.packages("C50")
library(C50)
m_c50_bst <- C5.0(default~. , data = credit, trials = 100)
install.packages("adabag")
library(adabag)

set.seed(300)
m_adaboost <- boosting(default ~. , data = credit)#파산 하는지 의사결정 tree를 boosting해서 만든다.
p_adaboost <- predict(m_adaboost, credit)#모델가지고 예측 
head(p_adaboost$class)
p_adaboost$confusion

#set.seed(300)
#adaboost_cv <- boosting.cv(default ~ . , data = credit)
#adaboost_cv$confusion

#bagging
library(ipred)
set.seed(300)
mybag <- bagging(default ~. , data= credit, nbagg = 25) #radong data 25개 마추어서 모여봐 
predict_pred <- predict(mybag, credit)
table(predict_pred, credit$default)

'Study > R' 카테고리의 다른 글

R-10 (0)	2020.09.05
R-8 (0)	2020.09.05
R-7 (0)	2020.09.05
R-6 (0)	2020.09.05
R-5 (0)	2020.09.05

R-8

2020. 9. 5. 14:27

728x90

통계적 분석 / 머신러닝
통계분석
• 기초통계 • 가설과 검정
1. 기초통계
1. 통계학과 데이터
정의->데이터를 통하여 분석하고 분석결과를 통해 정보를 제공하는 분야
목적-> 통계학을 시작함에 앞서 통계적 목적에 따른 통계학의 분류와 분석에 사용되는
데이터의 구성 및 형태를 파악하기 위함
Focus! ->1. 통계학은 통계적 목적에 따라 어떻게 구분되는가 ?
         ->2. 데이터의 구성은 어떻게 정의하는가 ?
         ->3. 데이터의 형태는 어떻게 구분되는가 ?
1. 통계학은 통계적 목적에 따라 어떻게 구분되는가 ?
->
기술 통계학
방대한 데이터를 그래프나 표 또는 몇 개의 숫자로 요약하여 데이터의
전반적인 내용을 쉽고 빠르게 파악
가능한 방법을 다루는 통계학
추측 통계학
관심 대상의 일부를 추출하고
추출된 대상을 통해 수집된 데이터를
근거로 하 여 관심 대상 전체의
특성을 추측하고 검정하는 방법을
다루는 통계학
기술 통계학 descriptive statistics

히스토그램 ,원 그림 ,꺾은선 그림 ,상자 그림 ,최솟값, 사분위수, 평균, 최댓값 ,줄기-잎 그림
줄기-잎 그림
1,2,6
11,12,16

%문제가 크다.
선거출구조사
Focus 1
Ⅱ. 통계적 초급 이론
추측통계학
선거출구조사
전체 유권자들을 대상으로 투표 내용을 조사하는 것이 목적인 선거 출구조사에서
출구 조사원들이 유권자 전체에게 설문을 하지 않는다.
전체 유권자들을 설문하기에는 시간도 많이 소요되지만 이에 따른 비용 또한 많이
들기 때문에 현실적으로 불가능하기 때문이다.
이와 같이 관심 대상의 일부를 통해 관심 대상 전체의 결과를 추측하는 것을 의미함
변수 (variable)
데이터를 저장한 이름으로 데이터의 속성을 표현하며
NO(데이터의 넘버링), SEX(성별), AGE(나이),
SCORE(점수)와 같은 것을 의미함

수집된 변수에 따라 분석의 범위가 달라질 수 있다 ?
→ 반별로 성별에 따른 영어점수를 비교하고자 하는데
    반 변수가 없는 경우 반별 분석 불가능

개체 (observation)
서로 다른 데이터를 구분해주는 값으로 중복되지 않는
값이며, NO변수에 입력된 데이터의 넘버링 값과 같은
고유한 값을 의미함
수집된 개체 수에 따라 분석의 범위가 달라질 수 있다 ?
→ 성별 비교를 하고자 하는데 수집된 데이터가 남자 5명
    여자 100명일 경우 성별 비교 분석 불가능

관측값 (observed value)
변수에 대응되는 값으로 SEX변수에 대한 M, W,
AGE변수에 대한 23, 25, ∙∙∙, 25 등과 같은 값을 의미
함
(개체 값도 관측값이 될 수 있음)

수집된 관측값 입력 상태에 따라 결과가 달라질 수 있다
→ 점수의 평균을 파악하고자 하는데 응답자의 70%가
    응답하지 않은 경우 결과가 왜곡됨 (Not Available)

양적 데이터
- 숫자로 표현된 데이터 (= 수치형 데이터)
- 숫자나 그래프로 요약 가능한 데이터
이산형 데이터
연속형 데이터
질적 데이터
- 속성을 나타낸 데이터 (= 범주형 데이터)
- 표나 그래프로 정리 가능한 데이터
명목형 데이터
순서형 데이터
양적 데이터를 가지고 표로 정리할 경우  전달력이 떨어질 수 있다 ?
질적 데이터를 가지고 숫자로 요약할 경우  잘못된 결과가 전달 될 수 있다 ?
2. 데이터 요약
정의
데이터의 전반적인 내용을 쉽고 빠르게 파악할 수 있도록 하는 방법
목적
데이터에 내재된 정보의 특성을 왜곡하지 않고 정확하게 파악하기 위해 왜곡된 사례와
주의해야하는 데이터를 살펴보고 올바른 데이터 요약 및 표현 방법을 파악하기 위함

Focus!
1. 데이터가 나타내는 의미가 왜곡된 경우가 있는가 ?
2. 데이터의 왜곡됨을 줄이기 위해 살펴봐야 하는 데이터는 어떤 것인가 ?
3. 데이터를 숫자로 요약하고자 할 때 어떤 방법이 있는가 ?
4. 데이터를 시각적으로 표현하고자 할 때 어떤 방법이 있는가 ?

남 녀 1,2 인데 3이 있으면 오류값이다.
극단값(이상치)->  boxplot
결측치 ->is.na
범위->range
왜도 -> 기울 어지는 지
첨도 -> 뽀족하다.
평균 -> 극단치 값의 영향을 받는다.
중앙값 ->
- 데이터 크기 순으로 정렬했을 때 중앙에     위치하는 값
- 극단값들의 영향을 적게 받음
최빈값
- 데이터 중 빈도가 가장 많은 값  - 극단값에 영향을 받지 않음
분산
평균을 중심으로 데이터의 흩어진 정도를      가늠하는 값 (단위 변동 o , 제곱)
표준편차
분산의 제곱근 (단위 변동 x)
데이터에서 평균을 빼는 것 ->편차
편차 다 더하면 0이 된다.
절대값 으로 하던가 ? 아니면 편차를 제곱해서 해주는 것
제곱하면 루트 해진다.
제곱하면 다루기 힘들어서 루트를 한다.
분산
파란것이 균일하다.고르다. 0으로 점수대에 많은 것들이 모여있다.

상대도수 ->확률
원 그림 ->4~5정도의 쪼가리가 있어야 한다. 5개 이상 되면 마지막 부분은 기타로 해야 한다. 12시부터 시작하게 오른쪽 방향으로 큰것 부터
정규분
데이터의 분포 형태가 평균을 기준으로 대칭에 가깝게 따를 때 일반적으로 정규분 포라 칭함
(중심극한정리에 의하여 데이터가 충분히 클 경우 데이터의 분포는 정규분포를 따름)
표준화
평균을 0으로, 표준편차를 1로 만드는 것으로 비교하고자 하는 대상이 다른 분포를 가 지거나 다른 단위를 가질 때 상대적인 크기로 비교하기 위해 사용
표준화는 표준정규분표로 만든다.
4일차 -> 45페이지
관측치 - 평균 / 표준편차(시그마) = 표준화 값
표준화는 단위가 사라진다. 숫자만 남는다.
정규화는 1과 0 사이에 남아서 하는 것
가장 큰 값은 1 가장 작은 값은 0이다.
단점은 : 이상치가 1로 잡혀서 특수한 데이터는 정규화 하기 힘들다.
분산이 크면 표준편차가 크다.
48페이지 하나 숫자가 어디에 여도 결과는 같다.

회귀regression->예측해서 하는것
실제치와 예측치의 차이는 오차이다.
하지만 아마 예측이기때문에 잔차이라고 한다.
부모가 키가 크다 그래서 자식이 키가 얼마인지 예측 하는가 ?->
평균족으로 모여드는 경향이 있다.
설명 변수(Explanatory Variable)
종속 변수에 선행하며, 종속 변수에 영향을 줄 것으로 기대되거나 종속 변수의 변화를 예측(predict)핛 수 있다고 여겨지는 변수
한 마디로 말하면 원인으로 갂주되는 변수
독립 변수 (independent variable), 예측 변수 (predictor variable), 입력 변수 (input variable), 특징 (feature) 등 여러 이름으로 부른다.
x = 독립변수 = 열
설명 변수
예측 변수
특성(키 등 )

반응 변수

회귀분석은 선형회귀분석은 가정이 필요하다. 직선을 그을 수 있는 상황이 되여야 한다.
회귀분석의 기본가정
두 변수의 관계는 선형이다.(선형성
오차항의 확률 분포는 정규분포를 이루고 있다.(정규성)
등분산성
오차항의 평균(기대값)은 0이다.
오차항들끼리는 독립이다. 어떤 패턴을 나타내면 안 된다.(독립성)
회귀는 인과관계를 찾는 것이다.

회귀 분석은 직선을 찾는 것이다.
회귀모형
회귀분석의 목적: 종속변수 Y와 설명변수 X 사이의 관계를 선형으로 가정하고 이를 가장 잘 설명하는 회귀 계수(coefficients)를 추정하는 것
3가지의 직선이 있다.
최소제곱법(OLS; Ordinary Least Squares)
y hat(예측)과 y 의 차이를 최소화(정확하는 y축 거리의 곱)
차이를 구한다.
제곱해서 다 더해주면 된다.
수가 작으면 오차가 적다는 의미한다.
수학에서 거리는 점과 직선사이의 거리이다. edit distance
점에서 직선을 걸때 거리가 짧은 것이다.
y가 예측한것과 y의 실제값이 차이가 오차이다.

회귀는 최소제곱법으로 찾는다.
점들이 직선을 가장 설명 잘 해준다.
제곱을 하는 것은 정사각형을 그렸다.
제곱오차를 사용하면 오차가 커질수록 비용이 엄청사게 커진다.
이상치는 정사각형을 크게 그려진다. 최소제곱법은 이상치를 빼고 해야만 정확도가 높아진다.

library(dplyr)
library(ggplot2)

#-------daimonds
names(diamonds)
#carat, price
lm_dia <- lm(price ~ carat, data = diamonds)
lm_dia
plot(diamonds$carat, diamonds$price)
abline(lm_dia, col= "blue",lwd= 2)
plot(diamonds$price ~ diamonds$carat)
plot(price ~ carat, diamonds)

lm_dia$coefficients
abline(lm_dia , col = "blue", lwd = 2)

#------------cars
cars
head(cars)
str(cars)
plot(cars)
#선형회귀분석 
#cars speed 
#     dist
#lm( y ~ x)
#lm(cars$dist ~ cars$speed )
lm_car <- lm(dist ~ speed, data = cars )
lm_car
abline(lm_car , col="red")#이 데이터를 선을 거으기
abline(lm_car , col="red",lwd = 2)
#y = a+bx
ggplot(cars, aes(x=speed, y = dist))+geom_point()+geom_smooth(method = "lm")
#파난색 라인 주위에 오차가 있다. 벗어났다. 
#많이 흩어지면 정확도가 별로 높지 않다.
ggplot(cars, aes(x=speed, y = dist))+geom_point()+geom_smooth(method = "lm")+theme_dark()#배경 

lm_car
#선형 예측을 하는데 크다.
#Call:
#  lm(formula = dist ~ speed, data = cars)

#Coefficients:
#  (Intercept)        speed  
#-17.579->distance        3.932->기울기

cars$speed
str(cars)
cars$speed
predict(lm_car, data.frame(speed = 30))#예측 100.3932

lm_car 
names(lm_car)

y = -17.579 + 3.392 * 30
y
  100.3932- 84.181
predict(lm_car, data.frame(speed = c(30,35,40)))#예측 100.3932
summary(lm_car)
lm_car$coefficients
a <- lm_car
a$coefficients
names(a)
#[1] "coefficients"  "residuals"     "effects"       "rank"          "fitted.values" "assign"       
#[7] "qr"            "df.residual"   "xlevels"       "call"          "terms"         "model"   
a$residuals#잔차 오차
a$effects
a$rank
a$fitted.values
a$assign
a$qr
a$df.residual#잔차이다. 정확하지 않아서 오차라고 할 수 없다.
a$xlevels
a$call
a$terms
a$model

predict()
#이쪽에 많는 것을 하고 있다.
#predict.glm, predict.lm, predict.loess, predict.nls, predict.poly, predict.princomp, predict.smooth.spline.
#원래는 지정해야 하는데 lm을 사용해서 했기 때문에 
#예측해서 알아서 해준다.plot과 같다.
#predict.lm
## S3 method for class 'lm'
##predict(object, newdata, se.fit = FALSE, scale = NULL, df = Inf,
##        interval = c("none", "confidence", "prediction"),
##        level = 0.95, type = c("response", "terms"),
##        terms = NULL, na.action = na.pass,
##        pred.var = res.var/weights, weights = 1, ...)
#object는 원래것 해야 하고 
#newdata 새로운 data.frame()으로 해야 한다.
c <- data.frame(speed = c(122))
predict(lm_car, newdata = c)

new <- data.frame(speed = c(122))
predict(lm_car, newdata = new , interval = "confidence")


new <- data.frame(speed = c(122,125,130,133))
predict(lm_car, newdata = new , interval = "confidence")

new <- data.frame(speed = c(122))
predict(lm_car, newdata = new , interval = "confidence")

new <- data.frame(speed = c(122))
predict(lm_car, newdata = new , interval = "confidence", level = 0.9)#위에 것에 비해서 폭이 더 넓어졌다.


fitted.values

f1 <- function(x){
  y = -17.579095+3.932409
  return(y)
}

상간관계

e->숫자 10이다.
1.49e-12 10의 지수로 -12제곱
1.49* (10의 -12승) 아주 작은 숫자이다.

귀무가설을 기각할 활귤->영가설->제로 가설
귀무(guiwu) ->
예:한국인의 평균키는 170이다. 정확하는지 판단한다. 170이 맞다고 하는 편하고 170이 아닌 편이라고 한다.
2가지 가설이 있다.
귀무가설 한쪽이 , 다른 한쪽이 대립가설이라고 한다.
어느 한쪽이 정해지면 170을 귀무가설이라고 한다.
100명중 95명이 신뢰구간에서 뽑혀졌다.
유의수준은 5%정도이다.
귀무가설은 95%이다.
중간에 겹치는 부분이 있다. 왼쪽에는 점은 왼쪽이고
오른쪽에 있는 점은 오른쪽의 집단이라고 한다.
5%보다 적으면 오른쪽으로 정해진다.
유의수준 영역에 해당하는 경우가 대립가설을 충족하는 경우를 의미

0에 가까울수록 중요핚(통계적으로 유의미핚) 변수임을 의미핚다.
R-제곱 값: 직선이 데이터를 설명하는 정도, 상관계수와 관렦된 값
R-제곱 값의 정확핚 의미는? 그리고 구하는 방법은?

모형의 적합도 평가

오차가 적어야 잘 설명했다.
평균이 옆으로 직선
회귀선을 그린다.

편차 -산술 평균, 각각의 값

상간관계가 0이면 아무관계가 없엇 1.49e-12를 하는 것은 대립관계를 설명 하는 것이다.

오차 = 모집단의 회귀식에서 예측된 값 - 실제 관측값

잔차 = 표본집단의 회귀식에서 예측된 값 - 실제 관측값

오차는 예측하기 위하여 추정된 값과 실제값의 차이 ,즉 예측값이 정확하지 못한 정도->완전히 벗어난 것
잔차: 평균이 아니라 회귀식 등으로 추정된 값과의 차이 ,즉 추정된 값이 설명할 수 없어서
아직도 남아 있는 편차 ,잔차는 편차의 일부분

---------------------------오후
베이지안 bayesian
빈도확률->동전의 던져서 몇번 나오는지 ?->시행횟수를 반복하여 빈도수를 측정하게 되면 ,우리에게 익숙한 빈도 확률을 계산할 수 있따.
베이지안 확률->화산이 폭발할 확률을 빈도 확률로 계산할 수 있을가 ?세상에는 반복할 수 없는 사건이 무수히 많고 , '빈도 확률'의 개념을
그러한 사건에 적용할 수 없다.일어나지 않은 일에 대한 확률을 불확실성의 개념. 즉 ,사건과 관련 있는 어려 확률을 이용해 새롭게 일어날
사건을 추정하는 것이 베이지안 확률이다.
사전확률 p(A)  과 우도확률 p(B|A)를 안다면 사후확률 p(A|B)를 알 수있다는 것이다.

1) 18세기 영국의 수학자 토마스 베이즈(Thomas Bayes)가 도입.
2) 사전확률(Prior Probability)  : 추가적 정보가 주워지기전의 정보
3) 사후확률(Posterior Probability) : 추가적 정보가 주워진 상태에서의 사전확률 조건부확률과 같음
4) 사전확륙과 사후확률을 알고 있다면 우도확률(Likehood Probability)을 구할 수 있다.

https://brunch.co.kr/@chris-song/59

베이지안 추론 - 1편

설령 믿어지지 않는다고 해도, 가능성을 제외하고 남는 게 진짜 범인이다. | 셜록 홈즈의 베이지안 추론 베이지안 추론에 적합한 짤방으로 베이지안 추론에 대한 이야기를 시작하겠습니다. 명탐

brunch.co.kr

새로운 확률을 사전활류 가지고 업데이터한다.
예: 100명중에 남자50명 ,여자 50명 ,
50명중 긴머리 여자 25명 짧은 머리 여자 25명
50명중 짧은 머리 남자 48명 긴머리 남자 2명

사전확률: 현재상황에서 판단할 수 있는 것
사후확률: 새로운 증거에서 알아내고 계산하는 방법
사전확률에 따라 사후활률이 달라진다.

navi 사건에 따라 증거가 많이 나온다. 새로운 증거들이 독립적인지 확신을 못한다. 독립적이야만 이 베이직안 에 맞다.
독립적인지 확인못한다.
naive하게
나이브 개발사건마다 독립적이지 않는 것을 확인 시켰다. 잘 맞는다.
메일에서 스팸의 반대 햄
비아그라는 사전에 이미 알고 있는 것
업데이트 하면서 정상메일인지 스팸메일인지 한다.

http://rpubs.com/qa6273/HW4_1

RPubs - RSanford HW4 - #1

rpubs.com:443

#-------------------베이지안
library(readr)
sms_raw = read.csv("sms_spam.csv",stringsAsFactors = F)
#sms_raw = read_csv("sms_spam.csv")
View(sms_raw)
str(sms_raw)
head(sms_raw)

sms_raw$type <- factor(sms_raw$type)#범주형으로 바꾸기 
str(sms_raw)
table(sms_raw$type)#빈도수 확인
write.csv(sms_raw,"sms_raw.csv")

install.packages("tm")#build a corpus using the  text mining(tm)package
library(tm)

maintainer("tm")#만들은 분
citation("tm")# 정보 
?tm
vignette("tm")#간단한 예제를 보여준다.
#Corpus 말뭉치다.
#VCorpus( volatile Corpus)->메모리 휘발성 말뭉치
#PCorpus( permant COrpus) 영구저장 말뭉치
sms_corpus <- VCorpus(VectorSource(sms_raw$text))#뭉친다.
sms_corpus
#VCorpus : 문서를 Corpus class로 만들어 주는 함수로써 결과는 메모리에 저장되어 현재 그덩 중인 R object에만 영향을 끼침 
#PCorpus : VCorpus와 달리 문서를 Corpus로 만들어 database화 해주는 함수

#examine the sms corpus
print(sms_corpus)#구조 찍어보기 VCorpus되여졌음 메모리에 올라가서 나름 구조화 되여있다.
inspect(sms_corpus[1:2])#요약해 보겠다.

#check corpus message
as.character(sms_corpus[[1]])#cha로 바꾼다.
lapply(sms_corpus[1:2], as.character)#모든데 열에 다 적용해라 

#for (i in 1:length(sms_corpus)){
#  Encoding(corpus[[i]])="UTF-8"
#}
#sms_corpus_clean <- tm_map(sms_corpus, tolower)
sms_corpus_clean <- tm_map(sms_corpus, content_transformer(tolower))#소문자로 바꾸서 clean해라



as.character(sms_corpus[[1]])
as.character(sms_corpus_clean[[1]])

sms_corpus_clean <- tm_map(sms_corpus_clean, removeNumbers)#숫자 빼기
sms_corpus_clean <- tm_map(sms_corpus_clean, removeWords, stopwords()) #stop words 무의미한 단어 빼기
sms_corpus_clean <- tm_map(sms_corpus_clean, removePunctuation) # remove punctuation
removePunctuation("hello...world")
replacePunctuation <- function(x){gsub("[[:punct:]]+"," ",x)}
replacePunctuation("hello..world")

install.packages("SnowballC")
library(SnowballC)
wordStem(c("learn","learned","learning","learns"))

sms_corpus_clean <- tm_map(sms_corpus_clean,stemDocument)#변형된것 치우고 나머지 지우기 단어의 변형들을 포함되는 것만 빼기 
sms_corpus_clean <- tm_map(sms_corpus_clean,stripWhitespace)

#아래 두개 비교해본다.
lapply(sms_corpus[1:3], as.character)
lapply(sms_corpus_clean[1:3],as.character)

#단어를 해아려서 몇개 몇개 하는 것이다.
sms_dtm <- DocumentTermMatrix(sms_corpus_clean)
class(sms_dtm)
dim(sms_dtm)
colnames(sms_dtm)
#write.csv(sms_dtm,file="sms_dtm.csv")
#install.packages("tidytext")
#DocumentTermMatrix 한 내용을 보여주는 부분
# library(tidytext)
# as_td <- tidy(sms_dtm)
# View(as_td)
#inspect(sms_dtm)
#as.matrix(sms_dtm)
#View(sms_dtm)

sms_dtm2 <- DocumentTermMatrix(sms_corpus,control = list(
  tolower = TRUE,
  removeNumbers = T,
  stopwords= T,
  removePunctuation = T,
  stemming =T
))

sms_dtm3 <- DocumentTermMatrix(sms_corpus,control = list(
  tolower = TRUE,
  removeNumbers = T,
  stopwords= function(x){removeWords(x,stopwords())},
  removePunctuation = T,
  stemming =T
))
sms_dtm2
sms_dtm3

sms_dtm_train <- sms_dtm[1:4169,]#학습문제 
sms_dtm_test <- sms_dtm[4170:5559,]#시험문제

sms_train_lables <- sms_raw[1:4169,]$type #답 여주는 것
sms_test_lables <- sms_raw[4170:5559,]$type

prop.table(table(sms_train_lables))#범위를 계산하는 것 spam is similar
prop.table(table(sms_test_lables))

install.packages("wordcloud")
library(wordcloud)
wordcloud(sms_corpus_clean,min.freq = 50, random.order = F)
#스팸으로 해서 
spam <- subset(sms_raw, type=="spam")
ham <- subset(sms_raw, type=="ham")

wordcloud(spam$text, max.words = 40, scale = c(3,0.5))
wordcloud(ham$text, max.words = 40, scale = c(3,0.5))

sms_dtm_freq_train <- removeSparseTerms(sms_dtm_train,0.999)
sms_dtm_freq_train

findFreqTerms(sms_dtm_train,5)#5번이상 나오는 것을 여기 

save_freq_words <- findFreqTerms(sms_dtm_train,5)
str(save_freq_words)

sms_dtm_freq_train <- sms_dtm_train[,save_freq_words]
sms_dtm_freq_test <- sms_dtm_test[,save_freq_words]

convert_counts <- function(x){
  x <- ifelse(x>0,"Yes","No")
}

sms_train <- apply(sms_dtm_freq_train,MARGIN = 2, convert_counts)
sms_test <- apply(sms_dtm_freq_test,MARGIN = 2, convert_counts)

#Trainging a model on the data
install.packages("e1071")
library(e1071)
#naive로 나이브베이지안으로 스팸인지 아닌지 확인한다.
sms_clasifier <- naiveBayes(sms_train,sms_train_lables)

#evaluating model perfrmance
sms_test_pred <- predict(sms_clasifier, sms_test)

install.packages("gmodels")
library(gmodels)
CrossTable(sms_test_pred, sms_test_lables, prop.chisq = F, prop.t =  F, prop.r =  F, dnn= c('predict','actual'))

sms_clasifier2 <- naiveBayes(sms_train, sms_train_lables, laplace = 1)
sms_test_pred2 <- predict(sms_clasifier2, sms_test)
CrossTable(sms_test_pred2, sms_test_lables, prop.chisq = F, prop.t =  F, prop.r =  F, dnn= c('predict','actual'))

'Study > R' 카테고리의 다른 글

R-10 (0)	2020.09.05
R-9 (0)	2020.09.05
R-7 (0)	2020.09.05
R-6 (0)	2020.09.05
R-5 (0)	2020.09.05

R-7

2020. 9. 5. 14:20

728x90

#구글 스프레드시트 ->개인에서 누러기->내용추가->파일->업로드->
잴위에 x라는 열이름이 생긴다.

open map
install.packages("ggmap")
library(ggmap)
library(ggplot2)
library(dplyr)
library(rvest)#read_html

html.airports <- read_html("https://en.wikipedia.org/wiki/List_of_busiest_airports_by_passenger_traffic")
html.airports
df <- html_table(html_nodes(html.airports, "table")[[1]],fill= T) #[[]]list중에서 첫번째것 
#html_table 표 가지로 가는 것 
#html_nodes 노드 가지로 가는 것 

head(df)
head(df$Airport)
colnames(df)
colnames(df)[6] <- "total" #있으면 보여주고 없으면 만들어준다.
df$total
df$total <- gsub(',','',df$total)#정규표현식 할떄 나오는 것 g는 global하는 것 
#전부다 대체 하는 것이다. ,를 찾아서 ''으로 대체 한다.
df$total <- as.numeric(df$total)#character->숫자로 바꾸는 것 
df$total
#df$total <- as.double(df$total,5)
#df$total
typeof(df$total)

df <- head(df,10)
df
write.csv(df$Airport,file="geoname.csv")#공항이름으로 구글에 들어가는 방법을 진행하고 있습니다.
#geoname.csv읽어오면 캐리터는 글자로 일지 않고 글자는 범주로 읽어온다.

숫자를 글자로 인식하고 ->범주로 인식한다.
string ->factor로 받아들인다.
그래서 factor->as.character->numeric으로 바꾼다.

#구글 스프레드시트 ->개인에서 누러기->내용추가
geo <- read.csv("geoname - geoname.csv",stringsAsFactors = F)#문자를 factor로 하는 것이 아니라 chr로 읽어 온다.
geo
geo <- na.omit(geo) 
str(geo)

#아래 방법을 사용하는게 좋다.
library(readr)
geo <- read_csv("geoname - geoname.csv")
str(geo)

getwd()

#구글 스프레드시트 ->개인에서 누러기->내용추가
geo <- read.csv("geoname - geoname.csv")
geo
geo <- na.omit(geo) # geo <- geo[-1,]
geo

df <- cbind(df, lat = geo$Latitude, lot = geo$Longitude)
install.packages("maps")
library(maps)
world <- map_data("world")
world
#install.packages("reprex")
#library(reprex)

str(df)
dplyr::glimpse(df)
str(df)
#소수점 빼기 -1
df$lot <- as.numeric(as.character(df$lot))# Factor범주형이기때문에 
df$lat <- as.numeric(as.character(df$lat))
#소수점 빼기 -2 #안됨 숫자만 가능함 
#round(df$lat, digits = 5) 
df

as.numeric(df$lot)#1,2,3,4,5 이런식으로 나온다.

df$lot
as.double(df$lot,-5)
head(world,6)
head(df)
format(round(df$lot, 5), nsmall = 5)
str(world)

ggplot(df, aes(x = lot, y = lat))+
  geom_polygon(data = world, aes(x = long, y =lat, group = group ), fill="grey75", color="grey70")+
  geom_point(data = df, color = rainbow(10), alpha = .25, aes(size= total))+
  geom_point(data = df, color = rainbow(10), alpha = .75, shape = 1, aes(size = total))+
  geom_segment(data = df, aes(x =lot, y = lat,xend = lot, yend = lat, colour = Airport ))

numeric->하면 아래 너무 많이 짜른다.

#twitter
----------------------------twitee----------
load("Obama.rda")
load("Trump.rda")

save(tweets_Obama,file="Obama.rda")
save(tweets_Trump,file="Trump.rda")
tweets_Obama
tweets_Trump
View(tweets_Trump)
library(twitteR)

#tweets_Trump <- searchTwitter("Obama",lang="en",n =1000)
tweets_Obama <- twListToDF(tweets_Obama)#dataframe으로 바꿔주는 
Obama_text <- tweets_Obama$text#트위트 내용만 가져오기
Obama_text
head(Obama_text)
names(tweets_Obama)

tweets_Obama$text#이 열만 가져와서 하기 

Obama_text <- gsub("\\W"," ",Obama_text)#빈칸으로 바꿨습니다.W대문자 특수문자 바꾸는 것 
Obama_df <- as.data.frame(Obama_text)#다루기 편한 ggplot할때 는 dataframe의 형태로 바꿨다.
length(Obama_df)
length(Obama_text)

#Trump
#tweets_Trump <- searchTwitter("Trump,lang="en",n =1000)
tweets_Trump <- twListToDF(tweets_Trump)
Trump_text <- tweets_Trump$text
head(Trump_text)
names(tweets_Trump)

tweets_Trump$text

Trump_text <- gsum("\\W"," ", Trump_text)
Trump_df <- as.data.frame(Trump_text)

#단위를 뽑아서 사전을 만들었다.
#긍정적이면 3점 아니면 2점 
#hu.liu.pos = readLines('https://www.dropbox.com/sh/3xctszdxx4n00xq/AAA_Go_Y3kJxQACFaVBem__ea/positive-words.txt?dl=1');
#hu.liu.neg = readLines('https://www.dropbox.com/sh/3xctszdxx4n00xq/AABTGWHitlRZcddq1pPXOSqca/negative-words.txt?dl=1');


#1. url에서 가져오는 것 
hu.liu.pos = readLines('https://www.dropbox.com/sh/3xctszdxx4n00xq/AAA_Go_Y3kJxQACFaVBem__ea/positive-words.txt?dl=1');
hu.liu.neg = readLines('https://www.dropbox.com/sh/3xctszdxx4n00xq/AABTGWHitlRZcddq1pPXOSqca/negative-words.txt?dl=1');

sample=c("You're awesome and I love you","I hate and hate and hate. So angry. Die!","Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity.")
result=score.sentiment(sample,hu.liu.pos,hu.liu.neg)
result

#2. 파일로 가져오는 것 
pos.word <- scan("positive-words.txt",what ="character",comment.char = ";")#scan으로 이파일을 읽는다.
neg.word <- scan("negative-words.txt",what ="character",comment.char = ";")
pos.word
neg.word

score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
{
  require(plyr);
  require(stringr);
  scores = laply(sentences, function(sentence, pos.words, neg.words) {
    sentence = gsub('[^A-z ]','', sentence)
    sentence = tolower(sentence);
    word.list = str_split(sentence, '\\s+');
    words = unlist(word.list);
    pos.matches = match(words, pos.words);
    neg.matches = match(words, neg.words);
    pos.matches = !is.na(pos.matches);
    neg.matches = !is.na(neg.matches);
    score = sum(pos.matches) - sum(neg.matches);
    return(score);
  }, pos.words, neg.words, .progress=.progress );
  scores.df = data.frame(score=scores, text=sentences);
  return(scores.df);
}

Obama_scores <- score.sentiment(Obama_text, pos.word, neg.word, .progress = "text")
str(Obama_scores)
Obama_scores$score
hist(Obama_scores$score)

Trump_scores <- score.sentiment(Trump_text,pos.word, neg.word, .progress = "text")
Trump_scores$score
hist(Trump_scores$score)

#margin설정
#par(mar=c(2,4,2,2))#여백주는 함수

a <- dim(Obama_scores)[1]
b <- dim(Trump_scores)[1]

alls <- rbind(as.data.frame(cbind(type= rep("Obama",a), score = Obama_scores[,1])),
             as.data.frame(cbind(type= rep("Trupm",a), score = Trump_scores[,1])))
str(alls)
alls$score <- strtoi(alls$score)

library(ggplot2)
ggplot(alls, aes(x = score, color = type))+geom_density(size = 1)


nrow(Obama_scores)
nrow(Trump_scores)

#rbind #row로 묶어주는것 type열과 score이라는 열을 묶어준다.
#Obama 를 a= 1000번 복제하세요  score이라는 열을 잡아서 Obama_scores첫번째 열을 꺼내서 하는 것 
#str(Obama_scores)
#text가 범주형 
library(dplyr)
Obama_scores[,1]
str(alls)
distinct(alls$type)
str(as.character(alls$type))
n_distinct(as.character(alls$type))
distinct(as.character(alls$type))#오류 난다. 클래스 "character"의 객체에 적용된 'distinct_'에 사용할수 있는 메소드가 없습니다

4. 동적 시각화
“plotly”는 Interactive 그래프를 그려주는 패키지 ggplot2로 완성된 그래프를 단순히 ggplotly( ) 에 넣으면 된다.
https://plot.ly/r/

rCharts 는 자바스크립트 기반인 D3, Highcharts 를 R에서 그려주는 패키지.
require(devtools) ; install_github('rCharts', 'ramnathv')
googleVis google 에서 만든 Interactive 그래프를 그려주는 라이브러리
https://plot.ly/->dash->dash

https://plot.ly/r/

Plotly R Graphing Library

Plotly's R graphing library makes interactive, publication-quality graphs. Examples of how to make line plots, scatter plots, area charts, bar charts, error bars, box plots, histograms, heatmaps, subplots, multiple-axes, and 3D (WebGL based) charts.

plotly.com

#-------------------------plotly-----------------

mpg
ggplot(data = mpg, aes(x = displ , y = cty, color = drv))+
  geom_point()+xlim(3,5)+ylim(10,25)#2차원 산점도
   
ggplot(data = mpg, aes(x = displ , y = cty, color = drv))+
  geom_point()

install.packages("plotly")
library(plotly)
p1 <- ggplot(data = mpg, aes(x = displ , y = cty, color = drv))+
  geom_point()
ggplotly(p1)

p2 <- ggplot(mpg, aes(x= class, fill= drv))+geom_bar()
ggplotly(p2)

p2 <- ggplot(mpg, aes(x= class, fill= drv))+geom_bar(position="dodge")#bar를 옆으로 펼쳐지는 것 position="dodge"
ggplotly(p2)

#google
#rCharts
install.packages("devtools")
library(devtools)

library(ggplot2)
install_github('ramnathv/rCharts')
install_github('saurfang/rCharts',ref= 'utf8-writelines')
library(rCharts)
hair_eye_male <- subset(as.data.frame(HairEyeColor),Sex =="Male")
nPlot(Freq ~ Hair , group ="Eye" ,data = hair_eye_male, type ="multiBarChart")
rPlot(mpg ~ wt | am + vs, data = mtcars, type ="point", color ="gear")#4개로 쪼개서 보기 좋게 보여준다.


mp3 <- Leaflet$new()
map3$setView(c(51.505, -0.09), zoom = 13)
map3$marker(c(51.5, -0.09),bindPopup ="<p> I am a popup </p>")#영국 
map3$marker(c(51.495, -0.083),bindPopup="<p> Hi~ </p>")
library(dplyr)
mp3 <- Leaflet$new()
mp3
map3$setView(c(37.532600, 127.024612), zoom = 13)
map3$marker(c(37.5, 127.0),bindPopup ="<p> I am a popup </p>")#영국 
map3$marker(c(51.495, -0.083),bindPopup="<p> Hi~ </p>")

m1 <- mPlot(x ="date" , y =c("psavert","uempmed"), type ="Line", data =economics)
m1$set(pointSize = 0, linewidth = 1)
m1


map3 <- Leaflet$new() 
map3$setView(c(51.505, -0.09), zoom = 13) 
map3$marker(c(51.5, -0.09), bindPopup = "<p> I am a popup </p>") 
map3$marker(c(51.495, -0.083), bindPopup = "<p> Hi~ </p>") 
map3

https://www.youtube.com/results?search_query=cs231n
https://www.youtube.com/results?search_query=cs230
https://www.youtube.com/results?search_query=3blue1brown

Google Map Api 활용한 지도 시각화
ggmap패키지2.7 버전설치하고인증키등록하기
get_map() 함수
◉ location
- 불러올 지역의 중심을 ‘지역명’, ‘주소’, 또는 c(lon = 경도, lat = 위도) 처럼 위경도 좌표로 지정
- Windows 사용자들은 ‘지역명’이나 ‘주소’를 한글로 지정하려면 반드시 enc2utf8() 함수를 이용하여 한글의 인코딩 방식을 UTF-8으로 변경
◉ zoom
- 2 ~ 21 사이의 정수를 입력. 숫자가 커질수록 확대된 지도가 출력. 예를 들어 3은 대륙(Continent), 10은 도시(City), 21은 건물 (Building) 크기
get_map() 함수
◉ maptype
- ‘terrain’, ‘terrain-background’, ‘satellite’, ‘roadmap’, ‘hybrid’, ‘toner’, ‘watercolor’ 중에서 하나를 지정
◉ source
- ‘google’, ‘osm’, ‘stamen’, ‘cloudmade’ 중 하나를 선택
get_map
myLocation->위성으로
#leaflet은 인터랙티브한 지도를 구현할 수 있게 해주는 오픈소스 자바스크립트 라이브러리이다
#R의 leaflet 패키지는 R에서 데이터를 가지고 쉽게 Leaflet 지도를 생성할 수 있게 해준 다

install.packages("leaflet")
library(leaflet)
library(dplyr)
library(readxl)
library(stringx)

#http://www.weather.go.kr 접속한다.

1. [지진ㆍ화산] → 2. [국내지진 목록]
1. 날짜 지정(2009.01.01~2019.12.03) → 2. [검색하기] → 3. [엑셀 다운로드]
#다운받은 파일을 엑셀을 실행시켜 sheet가 몇 개인지 확인한다
#워킹디렉토리에 넣은 후 불러오기한다

eq <- read_xlsx('국내지진.xlsx')
View(eq)
# 북한을 제외한다
eq01 <- eq[grep("북한",eq$위치,invert = T),]
#eq01 <- eq[grep("북한",eq$위치,invert = F),]
#정규표현식 grep ("단어", 위치 , )
#invert = T북한행을 빼고 보여주는 것 
#invert = F북한행만 보여주는 것 
eq[grep("북한",eq$위치,invert = T),]
eq[grep("북한",eq$위치,invert = F),]
View(eq01)
# 위도, 경도 뒤의 불필요한 문자를 삭제한 후 숫자형으로 바꾸어 준다 
eq01$위도 <- str_replace(eq01$위도 , ' N','')#불필요한 부분 빼야 한다. 공백하고 N을 아무것도 없이 바꾼다.
eq01$위도 <- as.numeric(eq01$위도)#character을 numeric으로 바꾼다.
eq01$경도 <- str_replace(eq01$경도,' E','')
eq01$경도 <- as.numeric(eq01$경도)

str(eq01)

leaflet() %>% addTiles() %>% addMarkers(lng=eq01$경도, lat=eq01$위도, popup=paste0("발생시각 : ", eq01$발생시각, "<br/>위치: ", eq01$위치, "<br/>규모 : ", eq01$규모))

Kormaps2014 package
devtools::install_github("cardiomoon/kormaps2014") 
devtools::install_github("cardiomoon/moonBook2")
library(kormaps2014) 
library(moonBook2)

# http://www.localdata.kr 접속한다
# 1. 식품 → 2. 음식점 아이콘 아래의 [바로가기]
# 창 아래쪽으로 이동하여 [EXCEL] 클릭하여 파일을 다운로드한다

Tips
# 다운받은 파일을 열어보고 자료의 속성과 구조를 파악한다
# 운영체제나 프로그램에 따라 한글인코딩 방식이 달라지므로 항상 유의한다(csv, txt 파일)
# 객체를 생성할 때 가급적 간단하게 식별가능하게 정의 한다
# 기존의 성공한 데이터를 이용하여 가공하도록 한다
# 항상 데이터의 속성과 구조에 유의하여 가공을 하도록 한다
# 데이터 가공할 때에 효율적으로 사용해야 한다(시간과 노력의 비용 염두)
# 다양한 관점과 시각으로 접근하여 효율성을 높인다
# 필요한 패키지가 설치되고 로드되어 있는지 확인한다
엑셀은 sheet확인하기
2008년도 관악구 정보 확인해야 한다.

https://rpubs.com/cardiomoon/222145

devtools::install_github("cardiomoon/kormaps2014") 
devtools::install_github("cardiomoon/moonBook2")
library(kormaps2014) 
library(moonBook2)
areacode
korpop2 <- kormaps2014::korpop2
kormap1 <- kormaps2014::kormap1

korpop2
str(kormap1)#시 도 
str(kormap2)#구단위
str(kormap3)#동단위
str(korpop1)#invalid multibyte string, element 3
str(korpop2)#invalid multibyte string, element 3
str(korpop3)#invalid multibyte string, element 3
korpop2 <- changeCode(korpop2)#
korpop1 <- changeCode(korpop1)

library(ggplot2)
install.packages("mapproj")
library(mapproj)
library(dplyr)
theme_set(theme_gray(base_family = "NanumGothic"))
#https://wikidocs.net/41145
#2015년 인구총조사 중 “총인구_명”으로 단계구분도를 그리려면 다음과 같이 합니다.
kormap1
kormap1$lat
kormap1$long
str(kormap1)
ggplot(korpop1,aes(map_id= code, fill =총인구_명))+
  geom_map(map = kormap1, colour="black",size=0.1)+
  expand_limits(x= kormap1$long,y = kormap1$lat)+
  scale_fill_gradientn(colours = c('white','orange','red'))+
  ggtitle('2015년도 시도별 인구분포도')+coord_map()

#moonBook2패키지에 있는 ggChoropleth()함수를 이용하면
#훨씬 간편하게 단계구분도를 그릴 수 있습니다.
#다음은 시군구별, 읍면동별 단계구분도의 예제입니다.
#korpop2->kormap2로 하면서 남자_명으로 한다.
ggChoropleth(korpop2,kormap2,fillvar = "남자_명")

kormap3 <- kormaps2014::kormap3
korpop3 <- kormaps2014::korpop3
str(kormap3)
str(korpop3)
korpop3 <- changeCode(korpop3)
ggChoropleth(korpop3,kormap3,fillvar="주택_계_호")

area2code
str(korpop3)
str(kormap3)

areacode <- changeCode(areacode)
str(areacode)
ggChoropleth(korpop3,kormap3,fillvar="총인구_명",subarea=c("전라","광주"))

rm(list = ls())

remove.packages(c("ggiraphExtra", "tibble", "ggplot2"))
install.packages(c("ggiraphExtra", "tibble", "ggplot2"))
install.packages("devtools")
remove.packages(c("moonBook2"))
remove.packages(c("kormaps2014"))
library(kormaps2014) 
library(moonBook2)

ggChoropleth(korpop2,kormap2,fillvar="남자_명",interactive=TRUE)
ggChoropleth(korpop3, kormap3, fillvar = "남자_명",interactive= T,subarea=c("전라","광주"),tooltip="행정구역별_읍면동")

summary(tbc)
tbc1= tbc[tbc$year %in% c(2001,2005,2010,2015),]
ggChoropleth(tbc1,kormap1,fillvar="NewPts",facetvar="year",tooltip="name",interactive=T)

areacode

# 1. 식품 → 2. 음식점 아이콘 아래의 [바로가기]
# 창 아래쪽으로 이동하여 [EXCEL] 클릭하여 파일을 다운로드한다
install.packages('rlang') library(rlang) library(readxl) library(dplyr)
# 서울특별시 일반음식점 엑셀파일을 불러오기한다. 
seoulres<- read_excel("6110000_서울특별시_07_24_04_P_일반음식점.xlsx") View(seoulres)

# 엑셀프로그램으로 해당 파일을 열어 시트가 여러개인지 확인해야 한다
# 시트가 여러개일때 따로 따로 불러서 데이터를 합쳐야 한다 # 다시 시트별로 각각 불러 모은다 
seoulres01<- read_excel("6110000_서울특별시_07_24_04_P_일반음식점.xlsx", sheet = '일반음식점_1') 
seoulres02<- read_excel("6110000_서울특별시_07_24_04_P_일반음식점.xlsx", sheet = '일반음식점_2’)
# 두개의 데이터를 병합한다 seoulres03 <- rbind(seoulres01, seoulres02)
View(seoulres03)

str(seoulres03)

# 필요한 부분을 축출한다 
seoulch <- seoulres03 %>% filter(상세영업상태명 == '영업' & 위생업태명 %in% c('호프/통닭','통닭(치킨)'))
View(seoulch)

#열을 뽑는다.
# 어떤 변수를 활용할 것인가? 결측치가 적은 것을 택하자 
sum(is.na(seoulch$소재지우편번호))
sum(is.na(seoulch$소재지전체주소))

#1. 소재지전체주소 변수를 선택할 경우 해당변수의 결측치 제거 
seoulch01 <- na.omit(seoulch$소재지전체주소)
View(seoulch01)

# 문자열 함수 패키지를 로드한다. 
library(stringr)

# 정규식을 사용해서 '소재지전체주소'에서 '구명'만 추출한다. 
seoulch02 <- substr(seoulch01, 7, 10)#서울특별시 구로구 
#7->10자리 까지 뽑는다.
# 공백제거seoulch03 <- gsub(" ", "", seoulch02) 
seoulch02
seoulch03 <- gsub(" ","",seoulch02)#공백을 없앤다.
View(seoulch03)
unique(seoulch03)

# 필요없는 문자 제거 
seoulch04 <- str_replace_all( seoulch03, "[을충신필명중오인남다수북흥회쌍입광정저순만황초서장태주무산]","")
View(seoulch04)
str(seoulch03)
names(seoulch03)

#중구행만을 따로 떼고 가공한 후 다시 새로 붙여보자 
# 중구행만을 축출하자
seoulchch <- seoulch03[276:616]
View(seoulchch)
unique(seoulchch)

# 중구 뒤에 나오는 글자를 삭제하자 
seoulchch01 <- substr(seoulchch, 1,2)
View(seoulchch01)
unique(seoulchch01)

# 중구행을 삭제하여 새로 가공된 중구행을 결합할 준비를 하자
seoulch05 <- seoulch03[-(276:616)]
View(seoulch05)
unique(seoulch05)

# 가공된 중구행을 붙인다 
seoulch06 <- c(seoulchch01,seoulch05)
View(seoulch06)
unique(seoulch06)

# 구별로 빈도수를 구한다 
seoulch06 <- table(seoulch06)
View(seoulch06)
seoulch06
nrow(seoulch06)

# 2. 소재지우편번호(구우편번호)를 이용하는 방법 
seoulch07 <- na.omit(seoulch$소재지우편번호) 
View(seoulch07)

# 우편번호 앞 세자리를 추출한다 ->구로 구분한다.
seoulch08 <- substr(seoulch07, 1,3) 
View(seoulch08)

# 구별로 합산한다 
seoulch09 <- table(seoulch08)
View(seoulch09)
unique(seoulch08)

# 구명 객체를 생성하여 데이터를 만들 준비를 하자 
# 공통 키로 해서 
gu<- c("중구", "종로구", "서대문구", "마포구", "은평구", "동대문구", "중랑구", "도봉구","성동구", "강동구", "강남구", "성북구", "서초구", "송파구", "노원구", "용산구", "강북구", "광진구", "영등포구", "관악구", "구로구", "금천구", "동작구", "강서구", "양천구")

# 구명 객체와 병합한다 
seoulch10 <- data.frame(gu, seoulch09) 
View(seoulch10)

# 트리맵 시각화 
install.packages("treemap")
library(treemap)
treemap(seoulch10, index = "gu", vSize ="Freq", title = "서울시 구별 치킨집수")
#많은 것 부터 

# 패키지에 내장되어있는 기존의 데이터를 가공한다 
ch <- korpop2[1:25,] #1행부터 25까지 행을 보여준다.
ch 

# 필요한 변수를 추출한다 
ch01 <- select(ch, c("행정구역별_읍면동",'code'))
str(ch01)

# 한글 인코딩을 바꾸어준다 
ch01
ch02 <- changeCode(ch01)
View(ch02)
ch02

#각각의 변수의 속성을 문자로 바꾸어 주어 데이터 병합을 준비하자
ch02$행정구역별_읍면동
ch02$행정구역별_읍면동 <- as.character(ch02$행정구역별_읍면동) 
ch02$code <- as.character(ch02$code)

# 데이터 병합을 위해 정렬시킨다 
ch03 <- arrange(ch02, ch02$행정구역별_읍면동)

## gu 변수명을 '행정구역별_읍면동'으로 바꾸어준다
seoulch10
seoulch11 <- rename(seoulch10, '행정구역별_읍면동'='gu','치킨집수'=Freq)
View(seoulch11)
str(seoulch11)

# 마찬가지로 속성을 바꾸어 주어 병합을 준비한다 
#character속성으로 바꾸어준다.
seoulch11$행정구역별_읍면동 <- as.character(seoulch11$행정구역별_읍면동)
# 필요없는 부분을 삭제한다 
seoulch12 <- select(seoulch11, -seoulch08)
str(seoulch12)

# 정렬한다 
#정렬하여 오차없이 매칭해준다.
seoulch13 <- arrange(seoulch12, seoulch12$행정구역별_읍면동)
View(seoulch13)

# 데이터를 병합한다 
#korpop2 ->ch03 정렬되여 있는 것을 seoulch13정렬되여 있는 것과 합친다.
seoulch14 <- cbind(ch03,seoulch13)
seoulch14

# 정렬한다 
# 합친 seoulch14것을 마지막 하나를 뺀다.
seoulch15 <- seoulch14[,-1]
View(seoulch15)

# 시각화 
ggChoropleth(seoulch15,kormap2,fillvar="치킨집수",interactive=TRUE,subarea='서울',tooltip="행정구역별_읍면동",title='서울시 구별 치킨집수'  )

'Study > R' 카테고리의 다른 글

R-9 (0)	2020.09.05
R-8 (0)	2020.09.05
R-6 (0)	2020.09.05
R-5 (0)	2020.09.05
R-4 (0)	2020.09.05

R-6

2020. 9. 5. 14:14

728x90

install.packages("nycflights13")
library(nycflights13)
data(package = "nycflights13")
4개의 data set이 있다.
Data sets in package ĄŽnycflights13ĄŻ:

airlines                            Airline names.
airports                            Airport metadata
flights                             Flights data
planes                              Plane metadata.
weather                             Hourly weather data

install.packages("nycflights13")
library(nycflights13)
data(package = "nycflights13")

flights
#행의 개수

dim(flights)
str(flights)
nrow(flights)
flights #tibble 이여서 몇개만 보여주고 마지막에 개수 보여준다.
summary(flights)#행의 개수 안보임

fly1 <- flights
summary(fly1)
fly1 <- as.data.frame(fly1)
summary(fly1)#행의 개수  class를 보기

#flights 데이터셋에서 열이 이름(변수명)
colnames(flights)
names(flights)

#1월 1일 데이터는 모두 몇개 입니까 ?
library(dplyr)
flights %>% filter(month == '1' & day == '1' ) %>% nrow

flights

#도착 지연 시 이 2시간 이하인 항공편은 모두 몇 회입니까 ?
flights %>% filter(arr_delay < 120) %>% nrow

#출발시간과 도착시간이 모두 1시간 이상 지연된 항공편은 모두 몇 회입니까?
flights %>% filter(dep_delay >= 60 & arr_delay >= 60) %>% nrow

#filter(dep_delay >= 60 & arr_delay >= 60)->flight,origin,dest,arr_delay,dep_delay,distance,air_time조회해서 데이터셋 만들기 
flights_new <- flights %>% filter(dep_delay >= 60 & arr_delay >= 60) %>% select(flight,origin,dest,arr_delay,dep_delay,distance,air_time ,arr_time)
flights_new

#도착지연은 오름차순 출발지연은 내림차누
flights_new %>% arrange(arr_delay,desc(dep_delay))#앞에것 하고 동급 있을때 내림차순으로 한다.

#gain speed구하기 
flights_new %>% mutate( gain = arr_delay- dep_delay, speed = distance/air_time )

flights %>% nrow#336776
flights <- na.omit(flights)#na없에기 
flights %>% nrow#327346
flights %>% summarize(delay= mean(dep_delay), delay_sd = sd(dep_delay))

#flights에서 12개월가 월별 평균출발시간(dep_time)을 구하세요
flights
flights %>% group_by(month) %>% summarise(dep_mean = mean(dep_time))
#'mean(dep_time)' 이름을 지정하지 않았다.


flights %>% distinct(month)
str(flights)
summary(flights)
summary(airquality)
library(ggplot2)
summary(mpg)

크롤링이란   (Crawling)  ->데이터 전처리
크롤링(Crawling)이란?
- 스크레이핑(Scraping)이라고도 하며 웹 페이지를 그대로 가져와서 거기서 데이터를 추출해내는 행위이다.
크롤링(Crawling)은 불법일까?
- 크롤링하여 얻은 데이터를 개인 하드에 소장하는 것까지는 합법이다.
하지만 배포하면 그 순갂부터 합법과 불법이 갈린다.
- 합법적인 크롤링은 사이트 운영자의 의사에 반하지 않는 것이다.
- 불법적인 크롤링은 사이트 운영자의 의사에 반하거나 실정법을 어기는 것이다.
- 크롤링은 웹페이지의 소스가 포함되어 있다. 이런 소스들은 웹프로그래밍 저작물이며
이를 불법 복제하는 것은 위법이다.
크롤링(Crawling)을 위한 기본 개념을 알아보자.
- 서버 : 외부에서 요청하면 규칙대로 정보를 제공하는 역할을 한다.
- 브라우저 : 서버가 주는 것들을 사용자에게 보여준다.
크롤링(Crawling)을 위한 기본 개념을 알아보자.
- 웹 서버는 text(html, css, js 등)와 image를 브라우저를 통해 사용자에게     보여준다
준비물

1. 구글크롬 2. Selector Gadget 3. 엑셀

• 방법

1. 구글크롬을 설치한다.

2. https://selectorgadget.com/에서 들어가서 북마크에 설치한다.

SelectorGadget: point and click CSS selectors

SelectorGadget:point and click CSS selectors SelectorGadget Screencast from Andrew Cantino on Vimeo. SelectorGadget is an open source tool that makes CSS selector generation and discovery on complicated sites a breeze. Just install the Chrome Extension or

selectorgadget.com

3. 다음에 들어간다.
4. 검색 키워드로 서치한다.
5. 원하는 컨테츠를 선택한다.(ex. 뉴스, 블로그, 카페)
6. 2페이지에 갔다가 다시 1페이지로 돌아온다.

install.packages("rvest")
library(rvest)
install.packages("stringr")
library(stringr)

title = c()#빈상태에서 접여여기
body = c()

#주소가 자동으로 바꾸게 넣어줘야 한다.
url_base ="https://search.daum.net/search?w=news&q=%EA%B5%AD%ED%9A%8C&DA=PGD&spacing=0&p="
url_crawl = paste(url_base);#paste는 문자 열 둘개를 붙이는 함수 
# url_crawl = paste(url_base, i )#url주소와 i를 붙여라 
print(url_crawl,1)
hdoc = read_html(url_crawl)
hdoc

for(i in 1:10){#i가 1부터 진행
  #url_crawl = paste(url_base, i , sep ="");#paste는 문자 열 둘개를 붙이는 함수 
  url_crawl = paste(url_base, i , sep ="");
 # url_crawl = paste(url_base, i )#url주소와 i를 붙여라 
  print(url_crawl)
  #t_css = ".f_link_b"#소제목으로 되여있다.F12로 하면 볼수 있다. .은 class를 표현하는 것이다.
  #b_css = ".desc"#
  #ie에서 F12로 하면 볼수 있다. .은 class를 표현하는 것이다
  #구글 selectorGadget에서 볼수 있다.
  
  hdoc = read_html(url_crawl)#그 경로에 있는 것을 읽어온다.가져오는 것
  
  
  #t_node = html_nodes(hdoc, t_css)#사진  ,text중 다르다.판단하는 함수가 
  #b_node = html_nodes(hdoc, b_css)
  t_node = html_nodes(hdoc, ".f_link_b")
  b_node = html_nodes(hdoc, ".desc")
  
  title_part = html_text(t_node)#글자를 가져오는 함수 달려가서 가져와야 한다.
  body_part = html_text(b_node)
  
  title = c(title,title_part)
  # title = c() vector형태로 되여있다.
  body = c(body,body_part)
}
news = cbind(title,body)
news
write.csv(news,"crawltest.csv")

텍스트마이닝
빈도분석 • 감성분석 • 한국어처리 KoNLP
rjAVA는 한글에서 필요하는것
RjAVA있어야 한

library(KoNLP)
library(wordcloud2)

useSejongDic()

text = readLines("textMining\\ahn.txt")
text

nouns <- extractNoun(text)#명사만 뽑아내는 것
nouns
#F1extract Nouns from Korean sentence uses Hannanum analyzer
#F2함수 내용  HannanumObj한글 페키지 
#"그렇습니다. 미래는 지금 우리 앞에 있습니다. " ->"미래" "우리" "앞"

nouns <- unlist(nouns)#LIST아니게 바꾼다.->1차원형태로 VECTOR형태로
nouns

nouns <- nouns[nchar(nouns)>=2]#nouns안에 있는 글자의 개수가 2보다 큰것 
nouns#nouns 2글자 이상인것만 한글자 지웠다.

#빈도를 새는것 
wordFreq <- table(nouns)
wordFreq <- sort(wordFreq, decreasing = T)#숫자가 많이 나온것을 정렬
wordFreq <- head(wordFreq, 20)
wordFreq

wordFreq <- table(nouns) %>% sort(decreasing = T) %>% head(20)
wordFreq
wordcloud2(wordFreq,fontFamily = '맑은 고딕')

useSejongDic()
nouns <- readLines("textMining\\leesungman.txt",encoding = "UTF-8") %>% extractNoun() %>% unlist()
nouns <- readLines("textMining\\leesungman.txt") %>% extractNoun() %>% unlist()
nouns <- nouns[nchar(nouns)>= 2]
nouns
wordFreq <- table(nouns) %>% sort(decreasing = T) %>% head(20)
wordcloud2(wordFreq,fontFamily = '맑은 고딕')
#jeonduhwan.txt
#kimdaejung.txt
#leesungman.txt 'textMining\leesungman.txt'에서 불완전한 마지막 행이 발견되었습니다
#roh.txt parkjunghee1.txt,leesungman.txt->그냥하면 된다.

#가장 많이 사용된 단어 알아보기 
txt = readLines("news.txt")
head(txt)

library(stringr)
extractNoun("대한민국의 영토는 한반도와 그 부속서로 한다.")
nouns <- extractNoun(txt)

wordcount <- table(unlist(nouns))
df_word <- as.data.frame(wordcount,stringsAsFactors = F)

df_word <- rename(df_word, word= Var1, freq = Freq)
df_word <- filter(df_word,nchar(word)>=2)

top20 <- df_word %>% arrange(desc(freq)) %>% head(20) 
top20

#useNIAdic
txt <- str_replace_all(txt,"//w","") #정규표현식

빈도가 많이 쓰여진다.
감성 분석
웹 사이트와 소셜미디어에 나타난 소비자의 감성을 붂석하여 유용한 정보로 재가공하는 기술을 의미한다.
문장에서 사용된 단어의 긍정과 부정의 빈도에 따라 문장의 긍정, 부정을 평가한다.
사람이 작성한 텍스트 앆에는 그 글의 주요 대상이 되는 주제(Topic)와 주제에 대한 글쓴이의 의견(Opinion)이 있다.
감성 붂석은 주제에 대한 글쓴이의 의견을 파악하는 것으로 Opinion Mining이라고도 한다

list.of.packages <- c("","","","")
new.packages <- list.of.packages()

install.packages("twitteR")
install.packages("ROAuth")
install.packages("plyr")
install.packages("stringr")
library(twitteR)
library(ROAuth)
library(plyr)
library(stringr)

#키
API_key <- ""
API_secret <- ""
access_token <- ""
access_secret <- ""

#object로 할수 있다. save(df_midterm,file="df_midterm.rda")->1일 날 것
#load("df_midterm.rda")

setup_twitter_oauth(counsumer_key = API_key, consumer_secret = API_secret,access_token = access_token,access_secret = access_secret)


#전 세계에서 
install.packages("ggmap")
library(ggmap)
library(ggplot2)

html.airports <- read_html("https://en.wikipedia.org/wiki/List_of_busiest_airports_by_passenger_traffic")
html.airports
df <- html_table(html_nodes(html.airports, "table")[[1]],fill= T) #[[]]list중에서 첫번째것 
#html_table 표 가지로 가는 것 
#html_nodes 노드 가지로 가는 것 

head(df)
head(df$Airport)
colnames(df)
colnames(df)[6] <- "total" #있으면 보여주고 없으면 만들어준다.
df$total
df$total <- gsub(',','',df$total)#정규표현식 할떄 나오는 것 g는 global하는 것 
#전부다 대체 하는 것이다. ,를 찾아서 ''으로 대체 한다.
df$total <- as.numeric(df$total)#character->숫자로 바꾸는 것 
df$total

Github에 있는 twitteR 패키지를 설치하기 위해 devtools 패키지를 설치한다.
rjson, bit64, httr은 의존성으로 함께 설치가 필요하다.
ROAuth 패키지는 트위터의 권한을 얻기 위해 설치가 필요하다.

* 패키지 설치 후 library() 로 로드한다. twitter 패키지는 install_github() 함수를 사 용하며
username은 자싞의 Desktop 사용자 이름으로 정의한다.

* 앞서 설명했던 API_key, API_secret, access_token, access_secret 을 복사 붙여넣기 한다.

* 저장했던 4개의 객체를 옵션으로 넣고 함수를 실행한다.

* searchTwitter() 함수를 이용하여 @apple 를 검색한 자료를 가져온다. since, until로 검색 기갂을 설정하고, lang로 언어를 설정할 수 있다.

* 500개를 설정했지만 실제 자료는 300개가 출력됐다. 해당 키워드에 일치되는 결과가 300개이기 때문

유투브에서 보기
3blue1brown
cs230
cs231n

knitr::kable(anscombe)
anscombe.1 <- data.frame(x = anscombe[["x1"]], y = anscombe[["y1"]], Set = "Anscombe Set 1")
anscombe.2 <- data.frame(x = anscombe[["x2"]], y = anscombe[["y2"]], Set = "Anscombe Set 2")
anscombe.3 <- data.frame(x = anscombe[["x3"]], y = anscombe[["y3"]], Set = "Anscombe Set 3")
anscombe.4 <- data.frame(x = anscombe[["x4"]], y = anscombe[["y4"]], Set = "Anscombe Set 4")
anscombe.data <- rbind(anscombe.1, anscombe.2, anscombe.3, anscombe.4)
aggregate(cbind(x, y) ~ Set, anscombe.data, mean)
aggregate(cbind(x, y) ~ Set, anscombe.data, sd)
model1 <- lm(y ~ x, subset(anscombe.data, Set == "Anscombe Set 1"))

model2 <- lm(y ~ x, subset(anscombe.data, Set == "Anscombe Set 2"))

model3 <- lm(y ~ x, subset(anscombe.data, Set == "Anscombe Set 3"))

model4 <- lm(y ~ x, subset(anscombe.data, Set == "Anscombe Set 4"))
library(plyr)

correlation <- function(data) {
  
  x <- data.frame(r = cor(data$x, data$y))
  
  return(x)
  
}

ddply(.data = anscombe.data, .variables = "Set", .fun = correlation)

summary(model1)

summary(model2)

summary(model3)

summary(model4)

library(ggplot2)
gg <- ggplot(anscombe.data, aes(x = x, y = y))

gg <- gg + geom_point(color = "black")

gg <- gg + facet_wrap(~Set, ncol = 2)

gg <- gg + geom_smooth(formula = y ~ x, method = "lm", se = FALSE, data = anscombe.data)

gg

#package있는지 없는지
install.packages("datasauRus")
library(datasauRus)
if(requireNamespace("dplyr")){
  suppressPackageStartupMessages(library(dplyr))
  datasaurus_dozen %>% 
    group_by(dataset) %>% 
    summarize(
      mean_x    = mean(x),
      mean_y    = mean(y),
      std_dev_x = sd(x),
      std_dev_y = sd(y),
      corr_x_y  = cor(x, y)
    )
}
if(requireNamespace("ggplot2")){
  library(ggplot2)
  ggplot(datasaurus_dozen, aes(x=x, y=y, colour=dataset))+
    geom_point()+
    theme_void()+
    theme(legend.position = "none")+
    facet_wrap(~dataset, ncol=3)
}
#reshap2가 없을때 
if(!require(reshape2)){
  install.packages("reshape2")
  require(reshape2)
}

비지도학습-군집분석
1.군집분석이란?
2.k-means
3.적정 k의 값
k는 숫자이다.
분류 vs 군집화
분류: 범주의 수 및 각 개체의 범주 정보를 사전에 알 수 있으면, 입력 변수 값으로부터 범저 정보를 유추하여 새로운 개체에 대해 가장
적합한 범주로 할당하는 문제(지도 학습)
군집화(clustering) 군집의 수 ,속성 등이 사전에 알려져 있지 않으면 최적의 구분을 찾아가는 문제(비지도 학습)
k는 숫자이다.
#회귀문제

if(!require(caret)){
  install.packages("caret")
  require(caret)
}

data("iris")
View(iris)

set.seed(123)
inTrain <- createDataPartition(y = iris$Species, p = 0.7, list = F)
training <- iris[inTrain,]
testing <- iris[-inTrain,]
training
#표준화
training.data <- scale(training[-5])#데이터 형태 맞춘다.
training.data
summary(training.data)

iris.kmeans <- kmeans(training.data[,-5],center = 3, iter.max = 10000)
iris.kmeans$centers

의사결정 나무
의사결정나무 정형데이터 나무 시리즈
다양한 도형을 분류하는 의사 결정 나무
검정색 찾을 때 까지 얼마나 가야 만 찾을 수 있는지

if(!require(NbClust)){
  install.packages("NbClust")
  require(NbClust)
}
#값이 정해진 것이 아니다.맘데로 정할 수 있다.
nc <- NbClust(training.data, min.nc = 2, max.nc = 15, method = "kmeans")

barplot(table(nc$Best.n[1,]),xlab ="Number of Clusters",ylab ="Number of chosen",main="Number of Clusters chosen")

training.data <- as.data.frame(training.data)#레ㅌ프트 생성한다.
modFit <- train(x = training.data[,-5],y = training$cluster, method="rpart")
#의상결정으로 분류하는 것  rpart
#rpart 반복적으로 20번씩 계속 해주는 것 
#분집을 나누고 의사결정 모데을 만들어서 예측해서 
#정답하고 결과가 얼마나 차이나는 가 
testing.data <- as.data.frame(scale(testing[,-5]))
testingclusterPred <- predict(modFit, testing.data)
table(testingclusterPred,testing$Species)

entropy 순도를 높여가는 양식
다른 변수인 소득을 기준으로 정렬하고 다시 같은 작업을 반복
entropy낮은 기준에서 나눈다.
순도를 높이기 위해 계속 나눈다.

계층적 방법
dendrogram 계층적분석 방법이다. ggplot에서

knn분석

ribbon
#ribbon
huron <- data.frame(year = 1875 : 1972 , level = as.vector(LakeHuron))
huron
ggplot(data = huron, aes(x= year))+geom_area(aes(y = level))
p <- ggplot(data = huron, aes(x = year))
p <- p+geom_area(aes(y = level))
p + coord_cartesian(ylim = c(570,590))#limit줘서 해줘야 한다.


p <- ggplot(data = huron, aes(x= year))
p <- p+geom_area(aes(y = level))
p +coord_cartesian(ylim = c(min(huron$level)-2,max(huron$level)+2))

 
p <- ggplot(huron,aes(x= year))+geom_ribbon(aes(ymin= min(level)-2,ymax = level+2))

p <- ggplot(huron, aes(x= year)) +geom_ribbon(aes(ymin = min(level)-2, ymax = level+2))

p <- ggplot(huron,aes(x = year,fill="skyblue"))+geom_ribbon(aes(ymin = min(level)-2, ymax = level+2,fill="skyblue"),color="skyblue")

p <- ggplot(huron, aes(x=year,fill="skyblue"))
p + geom_ribbon(aes(ymin=level-2, ymax=level+2 ,fill="skyblue"), colour="blue")

knn ->거리를 계싼하는 것
knn(k-Nearest Neighbor)새로운 돌아온 데이터와 그룹의 데이터와 가장 가까우니 새로운것과 재일 가까운 것은 그룹이다.->분류 알고리즘

k 인접 이웃 분류
overfitting 완전 최적화 데이터 under fit이 안되도록 학습할때 잘 안될 수 있다.
일반화 유지 그래서 적절한 것을 구해야 한다.
유클리드 거리 Euclidean distance L2거리 직선거리
맨하탄 거리 직선이 아닐때 L1거리

'Study > R' 카테고리의 다른 글

R-8 (0)	2020.09.05
R-7 (0)	2020.09.05
R-5 (0)	2020.09.05
R-4 (0)	2020.09.05
R-3 (0)	2020.09.05

R-5

2020. 9. 5. 14:04

728x90

이상치가 있으면 인지 하고
지우는지 범위를 변경하는지 알수 있다.
boxplot
상자기준으로
극단치
극단치 경계
윗수염: 하위 75 ~ 100%
3사분위(Q3) 하위 75%
2사분위(Q2) 하위 50% 중간값
1사분위(Q1) 하위 25%
아랫수염
극단치 경계

몇 %이상 들어가야만 사회가 안정적이다.
아니면 값이 이상치이다.
실제있는 데이터만 정확해서 수염의 길이를 표현하는 것있고 있는 것 값중에서 가장 작은 값을 표시한다.
있는데이터에서만 그려주니깐

library(ggplot2)
boxplot(mpg$hwy)
mpg$hwy <- ifelse(mpg$hwy < 12 | mpg$hwy > 37, NA, mpg$hwy)
table(is.na(mpg$hwy))

library(dplyr)
outlier <- data.frame(sex= c(1,2,1,3,2,1) , score= c(5,4,3,4,2,26))
outlier

table(outlier$sex)
table(outlier$score)

outlier$sex <- ifelse(outlier$sex == 3, NA, outlier$sex)#3은 이상치보다 오류이다.
outlier

outlier$score <- ifelse(outlier$score>5 , NA, outlier$score)#16
outlier

outlier <- data.frame(sex= c(1,2,1,3,2,1) , score= c(5,4,3,4,2,26))
boxplot(outlier$score)

outlier %>% filter(!is.na(sex) & !is.na(score)) %>% 
  group_by(sex) %>% 
  summarise(mean_score = mean(score))

mpg

ggplot(mpg,aes(drv, hwy))+geom_boxplot()
ggplot(mpg,aes(y =hwy))+geom_boxplot()
ggplot(mpg,aes(,hwy))+geom_boxplot()
mpg$drv
boxplot(mpg$drv)#숫자가 아니여서 못 그린다.
ggplot(data = mpg, aes(drv))+ geom_bar()

ggplot(mpg,aes(x = manufacturer, y = displ,colour = manufacturer,fill= "fill")) + geom_boxplot()
ggplot(mpg,aes(x = displ, y = manufacturer,colour = displ,fill= "fill")) + geom_boxplot()
#y는 숫자이여야만 높으를 줄수 있다.

na.rm =  T으로 권장드린다.

tibble -> data.frame 기본적인 것

3일차 분석사례실습 / 텍스트마이닝
한국복지패널데이터 분석
성별에 따른 월급 차이
성별 직업 빈도
종교 유무에 따른 이혼율
한국복지패널  데이터 분석
성별에 따른 월급 차이
-“성별에 따라 월급이  얼마나  다른가?”

install.packages("foreign")
library(foreign)#기본으로 깔려져있다.
library(dplyr)  
library(ggplot2) 
raw_welfare <- read.spss(file="Koweps_hpc10_2015_beta1.sav")
welfare <- as.data.frame(raw_welfare)#data를 편리하게 사용하려고 as.data.fram으로 나누었다.
str(welfare)#변수의 형태등 
glimpse(welfare)
head(welfare)
tail(welfare)
summary(welfare)

lit는 데이터 형에 아무른 변화가 없다.
파이썬 list는 vector에 가깝다.

raw_welfare <- read.spss(file="Koweps_hpc10_2015_beta1.sav")
welfare <- as.data.frame(raw_welfare)#data를 편리하게 사용하려고 as.data.fram으로 나누었다.
str(welfare)#변수의 형태등 
glimpse(welfare)
head(welfare)
tail(welfare)
summary(welfare)

#아래에 2가지 방법으로 가능하다.
#1.
welfare <- welfare %>% 
  rename( gender = h10_g3, birth = h10_g4,marriage = h10_g10, religion = h10_g11,
          income = p1002_8aq1, job = h10_eco9,
          region= h10_reg7) %>% 
  select(gender, birth, marriage, religion, income, job ,region)
welfare

#2.
welfare <- welfare %>% 
  select(gender = h10_g3, birth = h10_g4,marriage = h10_g10, religion = h10_g11,
         income = p1002_8aq1, job = h10_eco9,
         region= h10_reg7)
welfare

str(welfare)#numeric으로 변환한다.
summary(welfare)#gender이 이상하게 나온다.na값이 보여진다. 몇개있는지도 보여준다.
#숫자로
plot(welfare)#밑에와 위에 같은데 아래쪽 본다.
pairs(job ~ income+gender+region, data = welfare)
#줄로 된것은 1아니면 2밖에 없다. numeric이 아니라 범주형이다
#막대리 처럼 해지면 
boxplot(welfare)
#income을 따로 잡을수 있다.
sum(is.na(welfare))#21165
sum(welfare, na.rm = T)#38498929
colSums(is.na(welfare))#컬럼별로 sum을 한다.
summary(welfare$income)
mean(welfare$income)
mean(welfare$income,na.rm = T)#241.619
mean(is.na(welfare$income))#0.7219155

range(welfare$income)
range(welfare$income,na.rm = T)
welfare$income <- ifelse(welfare$income == 0 , NA, welfare$income)
#분석할때 소덕의 범위를 
#0이 아닌데 0으로 대답하는 분이 많다고 생각한다.
summary(welfare$income)
plot(welfare$income)#index가 row개수 

install.packages("psych")
library(psych)
describe(welfare)#descirbe 다른 관점에서 보여준다.
#다른 관점에서의 통계적 정보를 보여준다.

str(welfare)#16664
ggplot(data = welfare, aes(x= income))+geom_density()#밀도
#평균등으로 파악하기 힘든 집단이다.
#Removed 12044 rows containing non-finite values (stat_density). 
ggplot(data = welfare, aes(x = income))+geom_freqpoly()
#Removed 12044 rows containing non-finite values (stat_bin)

summary(welfare$gender)
welfare$gender <- ifelse(welfare$gender == 1, 'M','F')
summary(welfare$gender)#전에는 1,2 등으로 나왔는데 character로 바꿔졌다.
table(welfare$gender)#빈도수 세주는 것
ggplot(data = welfare, aes(x = gender))+geom_bar()
ggplot(data = welfare, aes(x = gender, colour = gender))+geom_bar()#테두리
ggplot(data = welfare, aes(x = gender,fill =gender))+geom_bar()#테두리
ggplot(data = welfare, aes(x = gender))+geom_bar(aes(fill =gender))#테두리
barplot(table(welfare$gender), xlab="gender" , ylab = "count",col=rainbow(3))  #barplot count를 한것으로 해야 한다. 그래서 table한 상태로 들어가야 한다.

welfare %>% select(gender, income) %>% group_by(gender) %>% summarise(평균= mean(income, na.rm = T))
welfare %>% filter( !is.na(income)) %>% group_by(gender) %>% summarise(평균= mean(income))
data_gender <- welfare %>% group_by(gender) %>% summarise(평균=mean(income,na.rm = T))
data_gender 
welfare %>% select(gender, income) %>% group_by(gender) %>% summarise(평균= mean(income, na.rm = T)) %>% 
  ggplot(aes(x= gender, y =평균,fill=gender))+geom_bar(stat = 'identity')
ggplot(data=data_gender, aes(x= gender, y =평균,fill = gender))+geom_bar(stat = 'identity')
#stat = 'identity'  이것 무조건 해줘야 한다. 아니면 오류난다.stat_count() must not be used with a y aesthetic

welfare %>% select(gender, income) %>% ggplot(aes(x= income, color =gender))+geom_density()

나이에 따른 소득 차이
-“몇 살에 수입이 가장 많은가?”

class(welfare$birth)
summary(welfare$birth)
qplot(welfare$birth)
#qplot을 쓰지 말라
boxplot(welfare$birth)#이상치를 확인하려면 boxplot을 보면 된다.
sum(is.na(welfare$birth))#결측치 확인 한 것
welfare$age <- 2015 - welfare$birth +1
#열이 없으면 새로 만든다.
summary(welfare$age)
plot(welfare$birth)
plot(table(welfare$birth))
barplot(welfare$birth)

welfare
#있던 데이터에서 정리한다. 나이별로 열을 하는 것을 만들었다.
age_income <- welfare %>% filter(!is.na(income)) %>% group_by(age) %>% summarise(mean_income= mean(income))
head(age_income)
ggplot(data=age_income, aes(x= age, y =mean_income))+geom_line()
ggplot(data = age_income ,aes(x= age,y=mean_income))+geom_point()
ggplot(data = age_income ,aes(x= age,y=mean_income))+geom_point(size=2, color= "blue")
ggplot(data = age_income ,aes(x= age,y=mean_income))+geom_point(size=2, color= "blue")+geom_line()
#layer익때문에 겹져진다.

연령대(세대)별 평균소득
#순서가 상관이 있다. 먼저 그리는 것에 겹쳐진다.

#세대별
install.packages("KoNLP")
library(KoNLP)#Checking user defined dictionary!

useSejongDic()

library(wordcloud2)
text1 = readLines("ahn.txt")
text1

#3: None
library(ggplot2)
library(dplyr)
welfare <- welfare %>% mutate(age_gen = ifelse(age <30,"young",ifelse(age<= 40,"g3",ifelse(age<=50 ,"g4",ifelse(age<= 60,"g5","old")))))
head(welfare,10)

table(welfare$age_gen)
qplot(welfare$age_gen)

age_gen_income <- welfare %>% group_by(age_gen) %>% summarise(mean_income = mean(income,na.rm = T))
age_gen_income

ggplot(data = age_gen_income,aes(x = age_gen, y = mean_income))+geom_col()
ggplot(data = age_gen_income, aes(x= age_gen, y = mean_income))+geom_bar(stat= "identity")
ggplot(data = age_gen_income, aes(x= age_gen, y = mean_income,fill=age_gen))+geom_bar(stat= "identity")
ggplot(data = age_gen_income, aes(x= age_gen, y = mean_income))+geom_bar(stat= "identity",aes(fill=age_gen))
ggplot(data = age_gen_income,aes(x= age_gen, y= mean_income))+geom_col(aes(fill=age_gen))+scale_x_discrete(limits= c("young","g3","g4","g5","old"))#순서대로  scale_x_discrete

나이와 성별에 따른 소득 차이
-“소득은 나이와 성별에 따라 어떻게 다른가?”


gender_income <- welfare %>% group_by(age_gen,gender) %>% 
  summarise(mean_income = mean(income,na.rm = T))
gender_income

ggplot(data = gender_income , aes(x = age_gen , y = mean_income, fill= gender))+geom_col()+scale_x_discrete(limits= c("young","g3","g4","g5","old"))
ggplot(data = gender_income , aes(x = age_gen , y = mean_income, fill= gender))+geom_col(position= "dodge")+scale_x_discrete(limits= c("young","g3","g4","g5","old"))#겹치지 않고 옆으로 진행

신경망
require 좀더 좋다. 없으면 알려준다.

library
data(iris)
str(iris)#불꽃데이터 ->변수
#'data.frame':	150 obs. of  5 variables:
#$ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 꽃받침
#$ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#$ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 꽃잎
#$ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ... 불꽃의 종류
temp <- c(sample(1:50, 30),sample(51:100,30),sample(101:150,30))
temp
iris.training <- iris[temp,]
iris.testing <- iris[-temp,]
library(nnet)
neuralNetResult <- nnet(Species~., data= iris.training, size = 3, decay =0 )#종을 맞추어봐라 나머지 너비,길이 맞춰보라 
neuralNetResult

summary(neuralNetResult)
#1번째 방법
library(devtools)
source_url('https://gist.githubusercontent.com/fawda123/7471137/raw/466c1474d0a505ff044412703516c34f1a4684a5/nnet_plot_update.r')
install.packages("reshape")
library(reshape)

#2번째 방법
library(clusterGeneration)
library(scales)
library(reshape)

plot.nnet(neuralNetResult)

pred <- predict(neuralNetResult, iris.testing, type = "class")
pred
real <- iris.testing$Species
table(real, pred)
summary(neuralNetResult)

iris 데이터를 잘 알아야 한다.

ggplot2::diamonds #diamonds는 ggplot2에 있는 것
datasets::iris#default 설치안해도 사용할 수 있는 것 

install.packages("ggplot2movies")
require(ggplot2movies)
movies
dim(movies)

첫번째 신경망을 로젠블랏의 퍼셉트론
퍼셉트론으 x이의 문제 가능하다.
처음에 들어간 데이터가 얼마인가
비선형함수이다.
#손글자

train <- read.csv("mnist_train.csv")
train[10,-1]#10번째 줄의 첫번째것 없에기
train[10,]#첫번때에 label이 있다.
train[10,1]#첫번때에 label이 있다.[1] 3
#3의 특징  예는 3이다.
train

dim(train)

#28* 28 PIXEL로 해서 783개로 해서 w값들이 바꾼다.
m = matrix(unlist(train[10, -1]), nrow = 28, byrow =T) #숫자가 RGB값이다.28라인 갈때마다 끊어진다.
#첫번째는 3이고 나머지는 rgb이다.
m

image(m, col = grey.colors(255))#image를 보고 싶을때 
write.csv(m,"mnist3.csv")

rotate <- function(x) t(apply(x, 2, rev))
par(mfrow = c(2,3))
lapply(1:6, function(x) image(
                      rotate(matrix(unlist(train[x,-1]),nrow = 28, byrow = TRUE)),
                      col= grey.colors(255),
                      xlab=train[x,1]
                      ))


par(mfrow = c(1,1))

#load caret library
install.packages("caret")
library (caret)

#createDataPartition( )은 데이터를 훈련 데이터와 테스트 데이터로 분할한다.
inTrain <- createDataPartition(train$label, p = 0.8, list=F)
#caret::createDataPartition(
#  y,          # 분류(또는 레이블)
#  times=1,    # 생성할 분할의 수
#  p=0.5,      # 훈련 데이터에서 사용할 데이터의 비율
#  list=TRUE,  # 결과를 리스트로 반환할지 여부. FALSE면 행렬을 반환한다.
#)
#0.8 곱해서 한다.
training<-train[inTrain,]#데이터를 저장한다.
testing<-train[-inTrain,]#데이터를 저장한다.
training
testing

write.csv(training, file ="train-data.csv", row.names = F)
write.csv(training, file ="test-data.csv", row.names = F)

install.packages("h2o")
library(h2o)
미부함수

local.h2o <- h2o.init(ip = "localhost", port = 54321, startH2O = TRUE, nthreads=-1)
training <- read.csv("train_data.csv")
testing <- read.csv("test_data.csv")
training
testing

training[,1] <- as.factor(training[,1])#범주형이다.

trData <- as.h2o(training)
trData[,1] <- as.factor(trData[,1])
tsData <- as.h2o(testing)
tsData[,1] <- as.factor(tsData[,1])

#unilx 시간 측정
start <- proc.time()
model.dl <- h2o.deeplearning(x = 2:785,
                             y = 1,
                             trData,
                             activation = "Tanh",#sigmoid등 으로 바꿀수 있다.
                             hidden=rep(160,5),
                             epochs = 20)
end <- proc.time()

diff=end-start
print(diff)

str(pred.dl.df)
pred.dl.df$predict 
pred.dl.df[1,1] 
table(test_labels[1,1],pred.dl.df[1,1]))

'Study > R' 카테고리의 다른 글

R-7 (0)	2020.09.05
R-6 (0)	2020.09.05
R-4 (0)	2020.09.05
R-3 (0)	2020.09.05
R -2 (0)	2020.09.05

R-4

2020. 9. 5. 13:59

728x90

library(dplyr)
library(ggplot2)
mpg
data1 <- mpg %>% filter(displ <= 4) %>% summarise(mean(hwy))
data1
data2 <- mpg %>% filter(displ >= 5) %>% summarise(mean(hwy))
data2 

data <- mpg %>% filter( displ <= 4 | displ >= 5) %>% mutate(group1 = ifelse(displ <= 4, 'a', ifelse(displ>=5,'b','c'))) %>% group_by(group1) %>% summarise(mean(hwy))
data

mpg %>% filter()

#mpg는 위에  10개만 보여준다.
head(airquality, 20)
head(mpg, 30) #강제로 10개만 보여준다.

#데이터 타입으로 바꾸는것 as.data.frame
airquality <- as_tibble(airquality)
airquality

mpg

mpg %>% filter(manufacturer=='audi' | manufacturer=='ford') %>% group_by(manufacturer) %>% summarise(test=mean(cty)) %>% arrange(desc(test)) 

mpg_new <-mpg %>% mutate(total = (hwy+cty)/2) %>% arrange(desc(total)) %>% head()        
d2 <-mpg %>% mutate(total = (hwy+cty)/2) %>% arrange(desc(total))  %>% head()  
d2 <- as.data.frame(d2)
arrange(d2,desc(total))

mpg_new <- mpg %>% select('class','cty')
mpg_new

mpg_new %>%  filter(class =='suv' | class =='compact') %>% group_by(class) %>% summarise(sum(cty)) 

mpg %>% select('class','cty') %>%  filter(class =='suv' | class =='compact') %>% group_by(class) %>% summarise(mean(cty))  
mpg %>% filter(class =='suv' | class =='compact')  %>% select('class','cty')  %>% group_by(class) %>% summarise(mean(cty)) 

mpg %>% filter(class =='compact') %>% group_by(manufacturer) %>% summarise(tot=n()) %>% arrange(desc(tot))

mpg데이터셋 보기
ggplot

str->str(mpg)

tbl_df , tbl and data.frame
tbl data.frame비슷하는데 계산하기 위해서 나오는 것
tbl_df
chr m num, int등으로 되여있다.
열의 이름을 보고 싶으면 names(mpg)
names(mpg)[8] #cty만 뜨게끔 몇번째 명을 지정한다.
str(mpg)
names(mpg)
names(mpg)[8] #cty만 뜨게끔 몇번째 명을 지정한다.

names(mpg)[8] <- 'city'
names(mpg)
mpg
names(mpg)[8] <- 'cty'
names(mpg)
mpg

dplyr :: glimpse(mpg)
mpg
names(mpg)[12]
mpg %>% select(c(names(mpg)[12] ))

mpg <- subset( mpg, select = -c(names(mpg)[12] ) )
mpg

mpg <- ggplot2::mpg
mpg
ggplot(data= mpg,aes(x= displ,y=hwy))+geom_point()

ggplot(mpg,aes(displ,hwy,colour = class))+geom_point()

ggplot(mpg, aes(displ,hwy))+geom_point((aes(colour ="blue")))# 바닥에 색갈이 었으면 두개 층이 겹쳐져서 원하는 색갈을 못가진다.레이어가 겁쳐서나오기  overwritting
ggplot(mpg, aes(displ,hwy))+geom_point(colour ="blue")#동급에서 색갈이 먹여지는데 

ggplot(mpg, aes(displ,hwy,colour = class))+geom_point() #색갈갈
ggplot(mpg, aes(displ,hwy,colour = trans))+geom_point()#
ggplot(mpg, aes(displ,hwy,colour = drv))+geom_point()#
ggplot(mpg, aes(displ,hwy,colour = cty))+geom_point()#

#shape도형형
ggplot(mpg, aes(displ,cty, shape=drv))+geom_point()
ggplot(mpg, aes(displ,cty, shape=class))+geom_point()
ggplot(mpg, aes(displ,cty, shape=trans))+geom_point()
ggplot(mpg, aes(displ,cty, shape=cty))+geom_point()#A continuous variable can not be mapped to shape

#size #많을 수록 많아진다.
ggplot(mpg, aes(displ, cty, size= cty)) +geom_point()
ggplot(mpg, aes(displ, cty, size= trans)) +geom_point()

ggplot(mpg, aes(displ, cty, size= cty)) +geom_point(colour ="red")
ggplot(mpg, aes(displ, cty, size= cty)) +geom_point(colour= cty)#객체 'cty'를 찾을 수 없습니다
ggplot(mpg, aes(displ, cty, size= cty)) +geom_point(aes(colour= cty))

#만약 size와 color를 다르게 주면 어떤 그림을 그려 낼까요 
ggplot(mpg, aes(displ ,cty, size = cty , color= drv))+geom_point()

ggplot(mpg,aes(cty,hwy))+geom_point()#점선


str(diamonds)
ggplot(diamonds,aes(carat,price))+geom_point()
ggplot(economics, aes(date, unemploy))+geom_line()#선으로 
ggplot(mpg,aes(cty))
ggplot(mpg,aes(cty))+geom_histogram()#1차원으로 하는 것것


ggplot(mpg,aes(cty))+geom_histogram(bins=10)#더 굵어진다.
ggplot(mpg,aes(cty))+geom_histogram(bins=20)#bins는 막대기 그림림

dia <- diamonds #이름 바꾸기 
class(dia)
dia
#ord order 순서가 정해진다.
#carat 크기 모양

#범주유형으로 색상 하기 
ggplot(diamonds,aes(carat,price, color=cut))+geom_point()
ggplot(diamonds,aes(carat,price, color=color))+geom_point()
ggplot(diamonds,aes(carat,price, color=clarity))+geom_point()

#r과 rstudio 속도 관련문제
r은 훨씬 빠르고 부드럽게 된다.

#facetting
#따로따로 보여주는 것
#범주용 데이터에 대하여

ggplot(mpg, aes(displ,hwy))+geom_point()+facet_wrap(~class)

geom_smooth()
ggplot(mpg,aes(displ,hwy))+geom_point()+geom_smooth()#곡선 범위 
ggplot(mpg,aes(displ,hwy))+geom_point()+geom_smooth(method="loess")#곡선 defaule  local지역을 쪼개서 연결하는 상황
ggplot(mpg,aes(displ,hwy))+geom_point()+geom_smooth(method="lm")#직선 범위  lw linear model 선형모델 

geom_boxplot()

#gitter 흩어주는 것인데 geom_violin()더 예쁘지는 것이다. 절반짤라서 
#geom_violin
ggplot(mpg, aes(drv, hwy)) +geom_violin()
ggplot(mpg,aes(drv,hwy))+geom_jitter()
ggplot(data = mpg, aes(x = drv, y = hwy))+
  geom_point(size = 2, position = "jitter")

geom_feqploy()
ggplot(mpg, aes(hwy)) +geom_freqpoly()#histogram그리고 그다음 그리는 것 
ggplot(mpg, aes(hwy)) +geom_freqpoly(bins = 20)#범위가 넓어진다.

geom_histogam()
ggplot(mpg, aes(displ,color= drv)) +geom_histogram(bindwidth= 0.5)#색상이 안된다.
ggplot(mpg, aes(displ,fill= drv)) +geom_histogram(bindwidth= 0.5)#
ggplot(mpg, aes(displ,fill= drv)) +geom_histogram(bindwidth= 0.5,position = "dodge")#한줄에 색사이 하나밖에 없다.


#geom_bar
#자주사용하는 것이다.
ggplot(mpg, aes(displ,fill= drv)) +geom_bar(position = "dodge")
ggplot(mpg, aes(displ,fill= drv)) +geom_bar(position = "fill")

ggplot(mpg, aes(manufacturer))+geom_bar()#변수가 하나일때는 stat안하면 자동으로count로 된다.
drugs <- data.frame(drug = c("a","b","c"), effect = c( 4, 9, 6))
ggplot(drugs, aes(drug, effect))+geom_bar(stat = "identity")#y값이 정해져있다.identity로무조건 설정해야 한다.
ggplot(drugs, aes(drug, effect))+geom_bar()#stat_count() must not be used with a y aesthetic.

ggplot(economics, aes(date, unemploy / pop))+geom_line()#알아서 계싼해서 만들어준다.
ggplot(economics, aes(date, unemploy))+geom_line()

ggplot(mpg, aes(drv, hwy)) +geom_boxplot()#박스가 25%에서 선을 거고 50% 선을 거 위에 25%에 선이 있다. 중앙값 
#잴 위에 있는 것은 이상값 동그라미 


mpg %>% filter(hwy < 20 & drv == 'f') 
mpg %>% filter(hwy < 25 & drv == 'f') %>% arrange(hwy)

데이터 프레임 합치기
결측치 not avaliable
데이터 정제하기[결측치] 데이터 비여있기

#na이냐 아니냐 
df <- data.frame(sex= c("M","F",NA,"M","F"),score=c(5,4,3,4,NA))
df
is.na(df)
table(is.na(df))

table(is.na((df$sex)))
table(is.na(df$score))

#na빼고 계산하면 안되기때문에 na로 되여있다.
mean(df$score)#[1] NA ->수학계산을 알수없다.
sum(df$score)#[1] NA->수학계산을 알수없다.

df %>% filter(is.na(score))#na인것만 골라낸다.
df %>% filter(!is.na(score))#na가 아닌것을 골라낸다.

#해결법은 아닌것만 모아서 계산하겠다.
df_nomiss <- df %>% filter(!is.na(score)) 
df_nomiss
#평균은 모두합쳐서 ,na빼고 , error
mean(df_nomiss$score) 
sum(df_nomiss$score)

df_nomiss <- df %>% filter(!is.na(score) & !is.na(sex))
df_nomiss

#na.omit생략한다. 지운다. na가보이면 그 행을 지운다. 
#na가 모이면 그 행을 삭제한다. 그래서 사용하면 안된다. 위험한 행위이다.
#다만 데이터가 엄청 많을때 무슨 영향을 미치는 지 알고 있을때 가능하다.
#위험한 행위이다.
df_nomiss2 <- na.omit(df)#모든 변수에 결측치 없는 데이터 추출
df_nomiss2

#아래것은 가장 권장한 결과이다.
#na가  있다는 것을 알고 있기에 원래 데이터가 수정되는 것이 아니여서 안정성이 있다.
mean(df$score, na.rm =  T)#결측치 제외하고 평균산출 
sum(df$score, na.rm = T)#결측치 제외하고 합계 산출
#결론은 어떻게 하냐 ? 입력 is.na na.rm na.omit
#결측치를 지우고 한것이다.

데이터 정제하기 [이상치] 이상한 이상이다.
이상치의 가장대표적인 예는 로또 1등이다. 분포에 끝에 있다는 것이지 나쁘는 것은 아니다.

outlier <- data.frame(sex= c(1,2,1,3,2,1) , score= c(5,4,3,4,2,26))
outlier

table(outlier$sex)
table(outlier$score)

outlier$sex <- ifelse(outlier$sex == 3, NA, outlier$sex)#3은 이상치보다 오류이다.
outlier

outlier$score <- ifelse(outlier$score>5 , NA, outlier$score)#16
outlier

outlier <- data.frame(sex= c(1,2,1,3,2,1) , score= c(5,4,3,4,2,26))
boxplot(outlier$score)

outlier %>% filter(!is.na(sex) & !is.na(score)) %>% 
  group_by(sex) %>% 
  summarise(mean_score = mean(score))

정규표현식
함수
인수가 없는 함수

#함수 ->변수선언할 필요없다.
#함수 ->

Minho <- function(){
  x <- 10
  y <- 20
  return (x*y)#돌려주는 값의 선언
}

ls()
Minho
Minho()

#인수가 있는 함수의 선언
Minho2 <- function(x,y){
  xx <- x
  yy <- y
  return (sum(xx,yy))#돌려주는 값의 선언
  #시스템이 정의한 특정 함수를 이용한 결과를 돌려줌
}

Minho2(2,3)

kaggle 
gitub

Minho3 <- function(x,y){
  x3 <- x+1
  y3 <- y+1
  x4 <- Minho2(x3, y3)#함수에서 함수를 부르는 경우 재귀호출이 가능하다.
  return(x4)
}
Minho3(2,4)

#결과를 화면에 반환하지 않고 변수에 할당
Minho4 <- function(){
  x <-  10
  y <-  10
  return(invisible(x*y)) # 결과값은 보여지지 않지만 변수에는 값이 들어간다.
}
Minho4()
result <- Minho4()
result

#함수 외부의 변수를 조작해야
rm(x)
x <- 70 #시스템 변수 x에 70을할당 
ls()
minho5 <- function(){
  x <- 10#함수내에서 사용하는 변수 
  y <- 20#함수내에서 사용하는 변수 
  
  x <<- 40 #시스템에서 사용하는 변수  x에 40을 할당
  return(x+y)
}
minho5()

minho5 <- function(){
  x <- 20#함수내에서 사용하는 변수 
  y <- 20#함수내에서 사용하는 변수 
  x+y
}

#return이 우선순위고 return이 없으면 마지막으로 된다.
minho5 <- function(){
  x <- 20#함수내에서 사용하는 변수 
  y <- 20#함수내에서 사용하는 변수 
  return(x+y)
  x-y
}

minho5()
minho5

sum1 <- 0 #변수를 설정
for(i in seq(1,10,by =1 )) sum1 <- sum1+i#1에서 10을 1단위로 차례로 넣고 결과 확인
sum1

sum1 <- 0
for(i in 1:5){
  for(j in 1:5){
    sum1 <- sum1 + i*j
  }
}
sum1

sum1 <- 0
for(i in 1:5){
  for(j in 1:5){
    sum1 <- sum1 + i*j
  }
}
sum1

sum1 <- 0 #변수를 설정
for(i in seq(1,10,by =1 )) sum1 <- sum1+i#1에서 10을 1단위로 차례로 넣고 결과 확인
sum1

sum1 <- 0
for(i in 1:5){
  for(j in 1:5){
    sum1 <- sum1 + i*j
  }
}
sum1

while문
sum2 <- 0
i <- 1
while(i <= 10){
  sum2 <- sum2+i;
  i <-i+1
}
sum2

#repeat
sum3 <- 0
i <- 1
repeat{
  sum3 <- sum3+i
  i <- i+1
  if(i>10) break#이 조건에 맞으면 탈출한다.
}
sum3 <- 0
i <- 0
repeat{
  if(i> 5) break
  j <- 0
  repeat{
    if(j > 5)break
    sum3 <- sum3+i*j
    j <- j+1
  }
  i <- i+1
}
sum3

sum1 <- 0
len <- 10
for(i in 1:len){
  sum1 <- sum1 + i
}
sum1

서포트 벡터 머신
데이터 성격 이해 ->탐색적 데이터 (ggplot2,baseR graphic,플롯 , 페어)->변수를 피처엔저니어링(필요한것만 추가 파생변수 등 만든다. 뉴테이트 변수를 빼거나 넣거나 하는것 )
->분석(예측,분류,CNN등)
회귀 최소제곱법 찾는 방법
  단순회귀 예측 연속성
  로지스트회귀 양자택 sigmoid함수 ->
  다중회귀
전문가시스템
의사결정에서의 시리즈

kaggle 알고리즘 -> 분석 의사결정시리즈

vector점이라고 생각하면 된다.
중간에는 decition boundary
margin
support vetors
선형분류 문제 ->데이터를 분류하는 선형 결정경계(Decstion boudary)
분류가능하도록 데이터를 변화시킨다.
함수로 해서 변화한다. 변형을 시킨다.
비선형 분류
소프트 마진
hypperparameter 로 튜닝하다.

playground.tensorflow.org/#activation=tanh&batchSize=10&dataset=circle&regDataset=reg-plane&learningRate=0.03&regularizationRate=0&noise=0&networkShape=4,2&seed=0.47074&showTestData=false&discretize=false&percTrainData=50&x=true&y=true&xTimesY=false&xSquared=false&ySquared=false&cosX=false&sinX=false&cosY=false&sinY=false&collectStats=false&problem=classification&initZero=false&hideText=false

Tensorflow — Neural Network Playground

Tinker with a real neural network right here in your browser.

playground.tensorflow.org

playground tensorflow

RBF(radial basis function)비선형 분류-> 잘 모르는 RBF
RBF(radial basis function)가우시안 커널
서포트 백터 머신의 장단점
장점
1. 분류문제나 회귀문제 동시에 쓸 수 있다.
2.사용하기 쉽다.
3. 예측의 정확도가 높다
단점
1.커널 hypperparameter 등을 위해 튜닝 과정이 필요하다.
2.선형 회귀/로지스틱 회귀와 다르게 신뢰 구간등을

svm 지원 library
e1071
kabR
kerlab

신경망은 rgb함수로
kernel 선형 아니면 마법사처럼 위에 올라갈것인지
분류가지고 예측한다.
데이터

#분류

install.packages("RCurl")
# load the library
library(RCurl)
# specify the URL for the Iris data CSV
urlfile <-'http://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data'
# download the file
downloaded <- getURL(urlfile, ssl.verifypeer=FALSE)
# treat the text data as a steam so we can read from it
connection <- textConnection(downloaded)
# parse the downloaded data as CSV
letters <- read.csv(connection, header=FALSE)
# preview the first 5 rows
colnames(letters)<-c("letter","xbox", "ybox","width","height",
                     "onpix","xbar","ybar","x2bar","y2bar",
                     "xybar","x2ybar","xy2bar","xedge","xedgey",
                     "yedge","yedgex")
View(letters)
str(letters)
head(letters)
write.csv(letters,"letters.csv")
#머신러닝은 기계로 배운다.

 test_split = 0.2
train_size = round((dim(letters)[1] *(1- test_split)))#20000* -0.8 test는 4만계
set.seed(20180621)#seed 값에 따라사  sample을 돈다.
train_index = sample(1:(dim(letters)[1]),train_size) #1~ 20000, 16000

letters_train <- letters[train_index,]
letters_test  <- letters[-train_index,] #train한것 나머지를 구한다.

install.packages("e1071")
library(e1071)#svm 지원하는 것 
#데이터 , 문자로 맞추고 변수는 그 나머지 모든 것
#fitting svm with linear kernel
letters_linear <- svm(letter~. , data = letters_train,kernel ="linear")
summary(letters_linear)

Call:
svm(formula = letter ~ ., data = letters_train, kernel = "linear")

Parameters:
   SVM-Type:  C-classification
SVM-Kernel:  linear
       cost:  1

Number of Support Vectors:  7147

( 225 456 371 133 484 436 213 209 234 183 313 202 239 287 235 298 346 236 240 335 240 196 175 280 458 123 )

Number of Classes:  26

Levels:
A B C D E F G H I J K L M N O P Q R S T U V W X Y Z

선형이여서 감마 값이 없다. 얼마나 멀리 있는지 가까이 있는지

Ubuntu ->windows에서 linux사용가능하다.

!hostname
#나를 위해서 만들어진 서버
7f22e97478ea

'Study > R' 카테고리의 다른 글

R-6 (0)	2020.09.05
R-5 (0)	2020.09.05
R-3 (0)	2020.09.05
R -2 (0)	2020.09.05
R-1 (0)	2020.09.02

R-3

2020. 9. 5. 13:50

728x90

filter
select
mutate
summarize
arrange

#관측치의 개수와 변수의 개수는 각각 몇 개입니까?
summary(airquality)
dim(airquality)
str(airquality)

#변수 각각에 대해 최솟값,최대값,중앙값,평균 등의 요약통계량을 한꺼번에 보고싶을때 쓰는 함수는 ?
summary(airquality)

library(dplyr)
#오존이  32q크고 ,바람은 9보다 작은 날은 모두 
airquality %>% filter(Ozone > 32 & Wind <9) %>% summarise(n())

airquality %>% select(Ozone , Wind , Temp , Month) %>% filter(Temp >= 80) %>% arrange(desc(Ozone)) %>%  head()

airquality %>% select(1 , 2 , 3 , 4)

airquality %>% select(1:4)

airquality %>% select(-2)


airquality %>% select(Ozone , Wind , Temp , Month) %>% group_by(Month) %>% summarise(ave= mean(Wind))
# summarise(ave= mean(Wind))
airquality %>% select(Ozone , Wind , Temp , Month) %>% group_by(Month) %>% summarise(ave= max(Wind))
#summarise(avg= mean(Wind))

airquality %>% filter(Wind >= 10) %>% group_by(Month) %>% summarise(avg= mean(Temp))

game <- read.csv("gamedata.csv")   #시간이 오래 걸린다.                                                                             
game


library(data.table)
data<- fread("gamedata.csv")

getwd()

dim(data)

library(readr)

data1 <- read_csv("gamedata.csv")
dim(data1)

head(data1)
summary(data1)

rm(data,data1)
rm(list=ls())

data <- fread("conveniencestore.csv",encoding = "UTF-8")
dim(data)

head(data)

data1 <- read_csv("conveniencestore.csv")#한글이 안깨진다. 알아서 코딩이 다 되여있다.
head(data1)

read.csv() #데이터 적을때 
fread
read_csv()#파일과 관계없이 잘 쓰여진다.

summary(data1)
summary(data)

빈도수  table

data <- sample(4, 29, replace = T)
data
table(data) #빈도수 
hist(data)#histogram
hist(table(data))# 붙어 있고 
barplot(data)
barplot(table(data))# 흩어져있다.
pie(table(data))#데이터를 tableㄹ 만들고 pie
table(data) %>% pie()
data %>% table() %>%  pie()

abline()
x-y평면에   y= a+bx 

저수준->위에 있을때 라인 text글자를 집여있다든지 예:abline() 그림을 그리지 않는다.
고수준->혼자서 그림을 그릴수 있다.

par(mfrow= c(1,1))
x <- c(2,3,2,3)
barplot(x)
fit <- lm(dist~speed, data= cars)
fit
plot(fit)
par(mfrow= c(2,2))
plot(fit)

abline(a= 40, b = 4, col ='red')

lty -> line type
lwd -> line weidth
col->색갈
v->vertical->수직
h->horisental ->수평
legend->범례

ggplot2예쁘게 보여주는 것
3. ggplot2 그래픽 패키지
ggplot2 패키지를 알아보자
gg grammer of Graphics
reticulater -> R studio에서 r처럼 사용하는 것
ggplot2
R graphcics cookbook
R science

www.r-graph-gallery.com/

www.ggplot2-exts.org/gallery/

더 다양한 시각화 https://plot.ly/r/

plotly는 Interactive 그래프를 그려주는 라이브러리입니다
Scala, R, Python, Javascript, MATLAB 등에서 사용할 수 있습니다

시각화를 위해 D3.js를 사용하고 있습니다
사용해보면 사용이 쉽고, 세렦된 느낌을 받습니다

mtcars
str(mtcars)
mtcars$cyl
library(data.frame)
mtcars$cyl <- as.factor(mtcars$cyl)
str(mtcars)

#캐릭터 pch(4,6,8)
plot(mpg ~ hp, data= mtcars, col= cyl, pch=c(4,6,8)[mtcars$cyl], cex=1.2)
legend("topright",legend= levels(mtcars$cyl),pch= c(4,6,8) , col = levels(mtcars$cyl))

library(ggplot2)
ggplot(mtcars, aes(x=hp,y=mpg,color= cyl, shape=cyl))+
  geom_point(size=3)

2+3
2단계는 80% 3에서는 30%
1.평면세팅
2.도형선택
3.라벨
4.테마
5.패싯
ggplot라는 부런다.
1.평면세팅 ggplot(data=,aes(x=,y=))
*ggplot(data = 데이터 셋명)
주요 함수 ggplot(data = 데이터 셋명) : 데이터를 불러오는 역할
mapping = aes(x = , y = ) : x축, y축의 꾸미기로 사용한다

geom_function() : 어떤 그래프를 그릴지 정하는 함수
mapping = aes(항목1=값1, 항목2=값2)
: geom_function() 의 옵션으로 꾸미기로 사용한다.

position(x, y), color(색상), fill(채우기), shape(모양), linetype(선 형태), size(크기) 등
팩터로 바구는 것

mpg
str(mpg)
names(mpg)
ggplot(data = mpg ,aes(x = displ , y = hwy))#단계 배경 설정(측)
ggplot(data = mpg ,aes(x = displ , y = hwy))+ geom_point() #배경에 산정도 추가
ggplot(data = mpg ,aes(x = displ , y = hwy))+ geom_point() + xlim(3,6) #x측 분위 3~6으로 지정
ggplot(data = mpg ,aes(x = displ , y = hwy))+ geom_point() + xlim(3,6) + ylim(10,30) #범주형있을때 색갈이 생긴다.
#여기는 왼쪽으로 모여있다.

#범주데이터 fator 3가지 형태로 바꿔는 것 
ggplot(data= mpg, aes(x = displ, y = hwy, color= drv,shape = drv))+geom_point(size=2)

ggplot(data= mpg, aes(x = displ, y = hwy, color= cty))+geom_point(size=2)

summary(mpg$cty)
 factor하면 범주
ggplot(data = mpg, aes(x = displ, y = hwy)) +geom_point(aes(color= class))

ggplot(data = mpg, aes(x = displ, y = hwy,color= class)) +geom_point(size = 3)
ggplot(data = mpg, aes(x = displ, y = hwy)) +geom_point(aes(color= class), size = 3)

p <- ggplot(data = mpg, aes( x= displ,y= hwy))
p + geom_point(aes(color=class))

q <- geom_point(aes(color = class))
p + q

geom_point  Scatterplot
geom_bar  Bar plot
geom_histogram  Histogram
geom_density  Prabablity distribution plot
geom_boxplot  Box and whiskers plot
geom_text  Textual annotations in a plot
geom_errorbar  Error bars

ggplot(data = mpg, aes(x = displ, y = hwy) )+geom_point(size = 2)
ggplot(data = mpg, aes(x = displ, y = hwy , shape= drv) )+geom_point(size = 2)
ggplot(data= mpg, aes(x = displ, y = hwy, color = drv))+geom_point(size = 2)
ggplot(data= mpg, aes(x = displ, y = hwy, color = drv, shape= drv))+geom_point(size = 2)

ggplot(data = mpg, aes(x = displ, y = hwy) )+geom_point(size = 2)+geom_smooth(method = "lm") #수자 보여준다.
ggplot(data = mpg, aes(x = displ, y = hwy , shape= drv) )+geom_point(size = 2)
ggplot(data= mpg, aes(x = displ, y = hwy, color = drv))+geom_point(size = 3)
ggplot(data= mpg, aes(x = displ, y = hwy, color = drv, shape= drv))+geom_point(size =3)+geom_smooth(method = "lm")

p2 <- ggplot(data= mpg, aes(x= displ, y = hwy, color= drv, shape= drv))+
  geom_point(size = 2)
p2

p2 + geom_smooth(method = "lm")
p2 + geom_smooth(method="lm")+theme_dark()

3. 테마 theme

p3 <- ggplot(data= mpg, aes(x= displ, y = hwy, color= drv, shape= drv))+
  geom_point(size = 2)+
  geom_smooth(method= "lm")
p3
p3 + theme_dark() #배경 까막게

p3 <- ggplot(data= mpg, aes(x= displ, y = hwy, color= drv, shape= drv))+
  geom_point(size = 2)+
  geom_smooth(method= "lm")
p3
p3 + theme_dark() #배경 까막게
p3 + theme_bw() # 배경 줄 
p3 + theme_classic() # 아무것도 없음

help(theme_bw)

p3 + theme_gray() #배경 grey
p3 + theme_linedraw() #line 걸어짐
p3 + theme_light() #선 연하게 
p3 + theme_minimal()#테두리 없어짐
p3 + theme_void()
p3 + theme_test()

r은 in memory 방식이기때문에 늦다.

install.packages("ggthemes")
library(ggthemes)
?ggthemes
p2 + theme_wsj() # 오랜지 등
p2 + theme_economist() #색상 연두색
p2 + theme_excel_new() # 엑셀처럼
p2 + theme_fivethirtyeight()#
p2 + theme_solarized_2()
p2 + theme_stata()

4. 라벨

ggplot( data = mpg, aes(x= displ, y = hwy , color = drv , shape = drv))+
  geom_point(size = 2)+
  geom_smooth(method= "lm")+
  labs(title = "<배기량에 따른 고속도로 연비 비교>", x ="배기량", y ="연비" )

5. facet
#면 분할 하은 방법
d <- ggplot(mpg, aes(x = displ, y = hwy , color = drv)) +
  geom_point()
d
d + facet_grid(drv ~ .) #div로 3개로 분할한다.
d + facet_grid(. ~ cyl) #cyl 에 의해서 분할하는 데 열로 분할하라
d + facet_grid(drv ~ cyl)

d + facet_grid( ~ class)
d + facet_wrap( ~ class) #정렬

d + facet_wrap( ~ class, nrow = 2) #행의 개수
d + facet_wrap( ~ class, ncol = 4) #열의 개수

ggplot(data = mpg, aes( x= displ, y = hwy, color = drv))+
  geom_point(size = 2)
ggplot(data = mpg, aes(x = displ, y = hwy, color = drv))+
  geom_point(size = 2, position = "jitter")
dplyr :: glimpse(mpg)
jitter는 모호하게 하는 것이다 값이 거의 최적화 댈때 뭉갠다.

p1 <- ggplot(data= mpg, aes( x= displ, y = hwy , color = drv))
p1 + geom_point(size =2 )
p1+ geom_line() #라인으로 연결
p1 + geom_point(size =2) +geom_line()

hist는 붙어있고 연속변수
막대그래프는 이상변수 떨어져있다.

ggplot( data = mpg, aes( x= displ)) +geom_bar()#y 없을 때 count
ggplot( data = mpg, aes( x= displ, fill = factor(drv))) + geom_bar()
ggplot( data = mpg, aes( x= displ, fill = factor(drv))) +geom_bar(position = "dodge")

#비율로
ggplot( data = mpg, aes( x = displ, fill = factor(drv))) + geom_bar(position = "fill")
ggplot( data = mpg, aes ( x = displ, fill = factor(drv))) + geom_bar(position= "fill")+facet_wrap(~class)#나누어서

ggplot( data = mpg, aes( x = displ))+ geom_histogram()
ggplot( data = mpg, aes( x= displ))+ geom_histogram(fill= "blue")
ggplot( data = mpg, aes( x= displ))+ geom_histogram(fill = "blue", binwidth = 0.1) #쫍아졌다.

library(ggplot2)
library(dplyr)
plot(mtcars)
attach(mtcars)#변수를 쓰겠다.
mtcars
wt
disp
plot(wt) #기본함수  x와 y 에 대한 것에 
mpg
plot(wt, mpg)
plot(wt, mpg, main="wt와 mpg의 관계계")
plot(wt, disp, mpg)#Error in plot.xy(xy, type, ...) : 유효한 플랏 타입이 아닙니다

library(scatterplot3d)
scatterplot3d(wt, disp, mpg, main ="3D sactter plot")
scatterplot3d(wt, disp, mpg, pch = 15, highlight.3d = TRUE, type ="h", main = "3D sactter plot" )

library(rgl)
plot3d(wt, disp, mpg)
plot3d(wt, disp, mpg , main = "wt vs mpg vs disp" , col ="red" , size = 10)

시각화중급

Boxplot
Scatterplot
Densityplot

box plot-데이터 분포도 알 수 있음 최소갓 최대값 중앙값->어디에 몰려있는지
abc <- c(110 , 300, 150, 280, 310)
def <- c(180, 200, 210, 190, 170)
ghi <- c(210, 150, 260, 210, 70)
boxplot(abc,def,ghi)

# col: 상자내부의색지정
# names: 각막대의이름지정
# range: 막대의끝에서수염까지의길이를지정
# width: 박스의폭을지정
# notch: TRUE이면상자의허리부분을가늘게표시
# horizontal: TRUE이면상자를수평으로그림

5가지 요약 수치 사용

abc <- c(110 , 300, 150, 280, 310)
def <- c(180, 200, 210, 190, 170)
ghi <- c(210, 150, 260, 210, 70)
boxplot(abc,def,ghi)
boxplot(abc,def,ghi, col= c("yellow","cyan","green"),name =c("BaseBall","SoccerBall","BaseBall"),horizontal=T)
summary(abc)
summary(def)
summary(ghi)

head(iris)
ggplot(iris, aes(x= Sepal.Length, y = Sepal.Width))+geom_point()

ggplot(iris, aes(x= Sepal.Length, y = Sepal.Width))+geom_point(color="red",fill ="blue",shape = 21, alpha = 0.5, size= 6, stroke = 2)
#alpha투명도 
#stroke 안에 동그라미테두리

ggplot(iris, aes( x = Sepal.Length, y = Sepal.Width, color = Species,shape= Species))+geom_point(size = 6, alpha = 0.5)
ggplot(iris, aes( x = Sepal.Length, y = Sepal.Width, color = Species,shape= Species))+geom_point(size = 3, alpha = 0.5)

data = head(mtcars,30)
ggplot(data,aes(x= wt, y = mpg))+geom_point()+geom_text(label= rownames(data),nudge_x = 0.25, nudge_y = 0.25,check_overlap = T)
#check_overlap겹치느나 안겹치느내
#nudge_x 동그라미 와 오른쪽 거리 
#nudge_y 동그라미와 위거리 

ggplot(data, aes(x = wt, y = mpg)) +geom_label(label = rownames(data),nudge_x = 0.25, nudge_y = 0.2)
#텍스트 둘래 박스 쳐준다.

ggplot(data, aes(x = wt, y = mpg,fill= cyl)) +geom_label(label = rownames(data),color="white",size= 5)
#박스와 다르다.

ggplot(data= iris, aes(x = Sepal.Length, y = Sepal.Width))+geom_point()+geom_rug(col= "steelblue",alpha = 0.1 , size = 1.5)
#테두리 가에 있는 것
#농도가 진해지면 수치가 많다. 분포

library(ggplot2)
install.packages("ggExtra")
library(ggExtra)
head(mtcars)
mtcars
mtcars$wt = as.factor(mtcars$wt)
mtcars$cyl = as.factor(mtcars$cyl)
mpg = as.factor(mpg)
str(mtcars)
ggplot(mtcars, aes(x = wt, y = mpg, color= cyl, size = cyl))+geom_point()+theme(legend.position = "none")
#legend.position = "none" 범례를 안보이게 하기 
ggplot(mtcars, aes(x = wt, y = mpg, color= cyl, size = cyl))+geom_point()

p <- ggplot(mtcars, aes(x = wt, y = mpg, color= cyl, size = cyl))+geom_point()+theme(legend.position = "none")
ggMarginal(p, type="histogram") #이력
ggMarginal(p, type="density") # 선
ggMarginal(p, type="boxplot") # boxplot
ggMarginal(p, type ="histogram", size = 10)#size 조정
ggMarginal(p, type = "histogram", fill="slateblue", xparams = list(bins= 10),yparams = list(bins = 10))

www.r-graph-gallery.com/

정지된것 은 plot
움직이는것 볼 수 있는 것은 창에서

data = data.frame(cond = rep(c("condition_1","condition_2"),each= 10), my_x = 1:100 +rnorm(100, sd= 9),my_y = 1:100 +rnorm(100,sd= 16))
data
#rep(c("condition_1","condition_2"),each= 10) 10번씩
#표준편차 sd
#정교분포
ggplot(data,aes( x= my_x, y = my_y))+geom_point(shape= 1)

#lm 직선 overfitting  
#se= T는 오류편차 주지 말라고 하는 것 범윌ㄹ 안아렬주고 대충알려준다.
ggplot(data, aes(x= my_x, y = my_y))+geom_point(shape= 1) +geom_smooth(method = lm, color="red" ,se= F)
ggplot(data, aes(x= my_x, y = my_y))+geom_point(shape= 1) +geom_smooth(method = lm, color="red" ,se= T)

a = seq(1,29)+4 * runif(29,0.4)
#runif 0~0.1
b = seq(1,29) ^ 2 +runif(29, 0.98)
library(dyplyr)
par(mfrow=c(2,2))#분할 로 해서 4개 그림 그린다.

plot(a,b, pch= 20)
plot(a-b, pch =18)
hist(a, border= F, col = rgb(0.2,0.2,0.8,0.7),main="")
#투명도 0.7
# 0.2 red 0.2 green 0.8 blue
boxplot(a, col ="grey", xlab="a")

install.packages("rattle")
library(rattle)
Temp3pm
cities <- c("Canberra","Darwin","Melbourne","Sydney")
ds <- subset(weatherAUS,Location %in% cities & !is.na(Temp3pm))#Location %in% cities합쳐주는 
p <- ggplot(ds, aes(Temp3pm, colour = Location, fill= Location))
p <- p_geom_denisity(alpha - 0.55)
p
View(weatherAUS)
# %in%속해있는지 
#subset(weatherAUS,Location %in% cities & !is.na(Temp3pm)) 행과열을 추출하는 것이다.

subset(weatherAUS,Location %in% cities & !is.na(Temp3pm))

data(diamonds)
head(diamonds)

ggplot(data = diamonds , aes(x = price, group = cut, fill= cut))+geom_density(adjust = 1.5)
ggplot(data = diamonds , aes(x = price, group = cut, fill= cut))+geom_density(adjust = 5)
#가격에 대해서 예상 이런조건이면 

ggplot(data = diamonds, aes(x= price, group = cut, fill= cut))+ geom_density(adjust = 1.5, alpha= 0.2)
ggplot(data = diamonds, aes( x= price, group = cut, fill= cut))+ geom_density(adjust = 1.5, position = "fill")#누적되서 나타나는 것
x1 = rnorm(100)
x2 = rnorm(100, mean = 2)
par(mfrow = c(2,1))

par(mar = c(0,5,3,3))
plot(density(x1),main="",xlab = "", ylim = c(0,1),xaxt = "n", las = 1, col = "slateblue1", lwd = 4)
par(mar= c(5,5,0,3))
plot(density(x2), main ="", xlab ="Value of my variable", ylim=c(1,0), las = 1, col="tomato3", lwd = 4)


diamonds
ggplot(data = diamonds , aes(x = depth, group = cut, fill= cut))+geom_density(adjust = 1.5)


data <- data.frame(name = c("north","south","south-east","north-west","south-west","north-east","west","east"),val=sample(seq(1,10),8))
data
mpg

install.packages("forcats")
library(forcats)
library(dplyr)
data %>% mutate(name = fct_reorder(name,val)) %>% ggplot(aes(x=name, y = val))+
  geom_bar(stat= "identity")+
  coord_flip() #오름차순 

data %>% mutate(name = fct_reorder(name, desc(val))) %>% ggplot(aes(x= name, y = val))+
  geom_bar(stat= "identity")+
  coord_flip() #desc 내름차순 

data <- data.frame(name = letters[1:5], value= sample(seq(4,15),5), sd = c(1,0.2,3,2,4))
ggplot(data) + geom_bar(aes(x= name, y = value), stat ="identity", fill ="skyblue", alpha= 0.7)+
  geom_errorbar(aes(x = name,ymin = value-sd, ymax = value+sd),width = 0.4 , colour ="orange", alpha = 0.9, size = 1.3)

ggplot(data)+
  geom_bar(aes(x= name, y = value), stat ="identity", fill ="skyblue", alpha = 0.5)+
  geom_crossbar(aes(x = name, y = value, ymin = value-sd , ymax = value+sd ), width = 0.4 , colour="orange", alpha = 0.9, size = 1.3)



ggplot(data)+
  geom_bar(aes(x= name, y = value), stat ="identity", fill ="skyblue", alpha = 0.5)+
  geom_linerange(aes(x = name, ymin = value-sd , ymax = value+sd ), width = 0.4 , colour="orange", alpha = 0.9, size = 1.3)

ggplot(data)+
  geom_bar(aes(x= name, y = value), stat ="identity", fill ="skyblue", alpha = 0.5)+
  geom_errorbar(aes(x = name, ymin = value-sd , ymax = value+sd ), width = 0.4 , colour="orange", alpha = 0.9, size = 1.3)+coord_flip()

'Study > R' 카테고리의 다른 글

R-6 (0)	2020.09.05
R-5 (0)	2020.09.05
R-4 (0)	2020.09.05
R -2 (0)	2020.09.05
R-1 (0)	2020.09.02

PREV 1 2 NEXT

NAIAHD

Study/R

R-10

'Study > R' 카테고리의 다른 글

R-9

'Study > R' 카테고리의 다른 글

R-8

'Study > R' 카테고리의 다른 글

R-7

'Study > R' 카테고리의 다른 글

R-6

'Study > R' 카테고리의 다른 글

R-5

'Study > R' 카테고리의 다른 글

R-4

'Study > R' 카테고리의 다른 글

R-3

'Study > R' 카테고리의 다른 글

+ Recent posts

티스토리툴바