반응형

filter
select
mutate
summarize
arrange

 

#관측치의 개수와 변수의 개수는 각각 몇 개입니까?
summary(airquality)
dim(airquality)
str(airquality)

 

#변수 각각에 대해 최솟값,최대값,중앙값,평균 등의 요약통계량을 한꺼번에 보고싶을때 쓰는 함수는 ?
summary(airquality)

library(dplyr)
# Count the days on which Ozone exceeds 32 and Wind is below 9.
airquality %>% filter(Ozone > 32 & Wind <9) %>% summarise(n())

# Among days with Temp >= 80, show the first rows ordered by descending Ozone.
airquality %>% select(Ozone , Wind , Temp , Month) %>% filter(Temp >= 80) %>% arrange(desc(Ozone)) %>%  head()

# Three ways of selecting columns by position: explicit indices, a range,
# and a negative index that drops the second column.
airquality %>% select(1 , 2 , 3 , 4)

airquality %>% select(1:4)

airquality %>% select(-2)


# Mean Wind per Month.
airquality %>% select(Ozone , Wind , Temp , Month) %>% group_by(Month) %>% summarise(ave= mean(Wind))
# summarise(ave= mean(Wind))
# Maximum Wind per Month; note that the result column is still named `ave`.
airquality %>% select(Ozone , Wind , Temp , Month) %>% group_by(Month) %>% summarise(ave= max(Wind))
#summarise(avg= mean(Wind))

# Mean Temp per Month, restricted to days with Wind >= 10.
airquality %>% filter(Wind >= 10) %>% group_by(Month) %>% summarise(avg= mean(Temp))

game <- read.csv("gamedata.csv")   #시간이 오래 걸린다.                                                                             
game


library(data.table)
data<- fread("gamedata.csv")

getwd()

dim(data)

library(readr)

data1 <- read_csv("gamedata.csv")
dim(data1)

head(data1)
summary(data1)

rm(data,data1)
rm(list=ls())

data <- fread("conveniencestore.csv",encoding = "UTF-8")
dim(data)

head(data)

data1 <- read_csv("conveniencestore.csv")#한글이 안깨진다. 알아서 코딩이 다 되여있다.
head(data1)

read.csv() #데이터 적을때 
fread
read_csv()#파일과 관계없이 잘 쓰여진다.

summary(data1)
summary(data)

빈도수  table

data <- sample(4, 29, replace = T)
data
table(data) #빈도수 
hist(data)#histogram
hist(table(data))# 붙어 있고 
barplot(data)
barplot(table(data))# 흩어져있다.
pie(table(data))#데이터를 tableㄹ 만들고 pie
table(data) %>% pie()
data %>% table() %>%  pie()

abline()
x-y평면에   y= a+bx 

저수준 -> 이미 그려진 그림 위에 선이나 텍스트 등을 덧그린다. 예: abline(). 혼자서는 그림을 그리지 않는다.
고수준 -> 혼자서 그림을 그릴 수 있다.

par(mfrow= c(1,1))
x <- c(2,3,2,3)
barplot(x)
fit <- lm(dist~speed, data= cars)
fit
plot(fit)
par(mfrow= c(2,2))
plot(fit)

abline(a= 40, b = 4, col ='red')

lty -> line type (선 종류)
lwd -> line width (선 굵기)
col -> 색깔
v -> vertical -> 수직
h -> horizontal -> 수평
legend -> 범례

 

ggplot2예쁘게 보여주는 것
3.  ggplot2 그래픽 패키지 
ggplot2 패키지를 알아보자 
gg -> Grammar of Graphics
reticulate -> RStudio에서 Python을 R처럼 사용할 수 있게 해 주는 패키지
ggplot2
R Graphics Cookbook
R science

www.r-graph-gallery.com/

www.ggplot2-exts.org/gallery/

 

더 다양한 시각화 https://plot.ly/r/

plotly는  Interactive 그래프를 그려주는 라이브러리입니다 
Scala, R, Python, Javascript, MATLAB 등에서 사용할 수 있습니다 

시각화를 위해 D3.js를 사용하고 있습니다 
사용해 보면 사용이 쉽고, 세련된 느낌을 받습니다

 

mtcars
str(mtcars)
mtcars$cyl
# `data.frame` is a base R function, not a package, so there is nothing to
# load with library() here.
# Treat the cylinder count as a categorical variable.
mtcars$cyl <- as.factor(mtcars$cyl)
str(mtcars)

# One plotting character (pch 4, 6, 8) per cylinder level. A factor passed to
# `col` is drawn with its integer codes (1, 2, 3), so the legend uses the same
# codes rather than the level labels "4", "6", "8".
cyl_pch <- c(4, 6, 8)
plot(mpg ~ hp, data = mtcars, col = cyl, pch = cyl_pch[mtcars$cyl], cex = 1.2)
legend(
  "topright",
  legend = levels(mtcars$cyl),
  pch = cyl_pch,
  col = seq_along(levels(mtcars$cyl))
)

library(ggplot2)
ggplot(mtcars, aes(x=hp,y=mpg,color= cyl, shape=cyl))+
  geom_point(size=3)

 

2+3
2단계는 80% 3에서는 30%
1.평면세팅
2.도형선택
3.라벨
4.테마
5.패싯
ggplot라는 부런다.
1.평면세팅 ggplot(data=,aes(x=,y=))
*ggplot(data = 데이터 셋명) 
주요 함수 ggplot(data = 데이터 셋명) : 데이터를 불러오는 역할 
mapping = aes(x = , y =  ) : x축, y축의 꾸미기로 사용한다 
 
geom_function() : 어떤 그래프를 그릴지 정하는 함수 
mapping = aes(항목1=값1, 항목2=값2)                   
: geom_function() 의 옵션으로 꾸미기로 사용한다. 
 
position(x, y), color(색상), fill(채우기), shape(모양), linetype(선 형태), size(크기) 등 
팩터로 바구는 것 

 

mpg
str(mpg)
names(mpg)
ggplot(data = mpg ,aes(x = displ , y = hwy))#단계 배경 설정(측)
ggplot(data = mpg ,aes(x = displ , y = hwy))+ geom_point() #배경에 산정도 추가
ggplot(data = mpg ,aes(x = displ , y = hwy))+ geom_point() + xlim(3,6) #x측 분위 3~6으로 지정
ggplot(data = mpg ,aes(x = displ , y = hwy))+ geom_point() + xlim(3,6) + ylim(10,30) #범주형있을때 색갈이 생긴다.
#여기는 왼쪽으로 모여있다.

#범주데이터 fator 3가지 형태로 바꿔는 것 
ggplot(data= mpg, aes(x = displ, y = hwy, color= drv,shape = drv))+geom_point(size=2)

ggplot(data= mpg, aes(x = displ, y = hwy, color= cty))+geom_point(size=2)

summary(mpg$cty)
 factor하면 범주
ggplot(data = mpg, aes(x = displ, y = hwy)) +geom_point(aes(color= class))

ggplot(data = mpg, aes(x = displ, y = hwy,color= class)) +geom_point(size = 3)
ggplot(data = mpg, aes(x = displ, y = hwy)) +geom_point(aes(color= class), size = 3)

p <- ggplot(data = mpg, aes( x= displ,y= hwy))
p + geom_point(aes(color=class))

q <- geom_point(aes(color = class))
p + q

geom_point  Scatterplot  
geom_bar  Bar plot 
geom_histogram  Histogram  
geom_density  Probability distribution plot
geom_boxplot  Box and whiskers plot  
geom_text  Textual annotations in a plot 
geom_errorbar  Error bars  

 

ggplot(data = mpg, aes(x = displ, y = hwy) )+geom_point(size = 2)
ggplot(data = mpg, aes(x = displ, y = hwy , shape= drv) )+geom_point(size = 2)
ggplot(data= mpg, aes(x = displ, y = hwy, color = drv))+geom_point(size = 2)
ggplot(data= mpg, aes(x = displ, y = hwy, color = drv, shape= drv))+geom_point(size = 2)

ggplot(data = mpg, aes(x = displ, y = hwy) )+geom_point(size = 2)+geom_smooth(method = "lm") #수자 보여준다.
ggplot(data = mpg, aes(x = displ, y = hwy , shape= drv) )+geom_point(size = 2)
ggplot(data= mpg, aes(x = displ, y = hwy, color = drv))+geom_point(size = 3)
ggplot(data= mpg, aes(x = displ, y = hwy, color = drv, shape= drv))+geom_point(size =3)+geom_smooth(method = "lm")

p2 <- ggplot(data= mpg, aes(x= displ, y = hwy, color= drv, shape= drv))+
  geom_point(size = 2)
p2

p2 + geom_smooth(method = "lm")
p2 + geom_smooth(method="lm")+theme_dark()

 

3. 테마 theme

p3 <- ggplot(data= mpg, aes(x= displ, y = hwy, color= drv, shape= drv))+
  geom_point(size = 2)+
  geom_smooth(method= "lm")
p3
p3 + theme_dark() #배경 까막게

p3 <- ggplot(data= mpg, aes(x= displ, y = hwy, color= drv, shape= drv))+
  geom_point(size = 2)+
  geom_smooth(method= "lm")
p3
p3 + theme_dark() #배경 까막게
p3 + theme_bw() # 배경 줄 
p3 + theme_classic() # 아무것도 없음

help(theme_bw)

p3 + theme_gray() #배경 grey
p3 + theme_linedraw() #line 걸어짐
p3 + theme_light() #선 연하게 
p3 + theme_minimal()#테두리 없어짐
p3 + theme_void()
p3 + theme_test()

r은 in  memory 방식이기때문에 늦다.

install.packages("ggthemes")
library(ggthemes)
?ggthemes
p2 + theme_wsj() # 오랜지 등
p2 + theme_economist() #색상 연두색
p2 + theme_excel_new() # 엑셀처럼
p2 + theme_fivethirtyeight()# 
p2 + theme_solarized_2()
p2 + theme_stata()

 

4. 라벨 

ggplot( data = mpg, aes(x= displ, y = hwy , color = drv , shape = drv))+
  geom_point(size = 2)+
  geom_smooth(method= "lm")+
  labs(title = "<배기량에 따른 고속도로 연비 비교>", x ="배기량", y ="연비" )

 

5. facet
#면 분할 하은 방법
d <- ggplot(mpg, aes(x = displ, y = hwy , color = drv)) + 
  geom_point()
d
d + facet_grid(drv ~ .) #div로 3개로 분할한다.
d + facet_grid(. ~ cyl) #cyl 에 의해서 분할하는 데 열로 분할하라 
d + facet_grid(drv ~ cyl)

d + facet_grid( ~ class)
d + facet_wrap( ~ class) #정렬

d + facet_wrap( ~ class, nrow = 2) #행의 개수
d + facet_wrap( ~ class, ncol = 4) #열의 개수

ggplot(data = mpg, aes( x= displ, y = hwy, color = drv))+
  geom_point(size = 2)
ggplot(data = mpg, aes(x = displ, y = hwy, color = drv))+
  geom_point(size = 2, position = "jitter")
dplyr :: glimpse(mpg)
jitter는 값이 거의 같아 점들이 겹칠 때, 위치에 작은 무작위 잡음을 더해 겹친 점들을 흩어서 보이게 하는 것이다.

 

geom_point  Scatterplot  
geom_bar  Bar plot 
geom_histogram  Histogram  
geom_density  Probability distribution plot
geom_boxplot  Box and whiskers plot  
geom_text  Textual annotations in a plot 
geom_errorbar  Error bars  오차 바

 

p1 <- ggplot(data= mpg, aes( x= displ, y = hwy , color = drv))
p1 + geom_point(size =2 )
p1+ geom_line() #라인으로 연결
p1 + geom_point(size =2) +geom_line()

히스토그램(hist)은 연속변수를 나타내며 막대가 서로 붙어 있다.
막대그래프는 이산변수를 나타내며 막대가 서로 떨어져 있다.

ggplot( data = mpg, aes( x= displ)) +geom_bar()#y 없을 때 count
ggplot( data = mpg, aes( x= displ, fill = factor(drv))) + geom_bar()
ggplot( data = mpg, aes( x= displ, fill = factor(drv))) +geom_bar(position = "dodge")

 

#비율로
ggplot( data = mpg, aes( x = displ, fill = factor(drv))) + geom_bar(position = "fill")
ggplot( data = mpg, aes ( x = displ, fill = factor(drv))) + geom_bar(position= "fill")+facet_wrap(~class)#나누어서 

ggplot( data = mpg, aes( x = displ))+ geom_histogram()
ggplot( data = mpg, aes( x= displ))+ geom_histogram(fill= "blue")
ggplot( data = mpg, aes( x= displ))+ geom_histogram(fill = "blue", binwidth = 0.1) #쫍아졌다.

 

library(ggplot2)
library(dplyr)
plot(mtcars)
attach(mtcars)#변수를 쓰겠다.
mtcars
wt
disp
plot(wt) #기본함수  x와 y 에 대한 것에 
mpg
plot(wt, mpg)
plot(wt, mpg, main="wt와 mpg의 관계계")
plot(wt, disp, mpg)#Error in plot.xy(xy, type, ...) : 유효한 플랏 타입이 아닙니다

library(scatterplot3d)
scatterplot3d(wt, disp, mpg, main ="3D sactter plot")
scatterplot3d(wt, disp, mpg, pch = 15, highlight.3d = TRUE, type ="h", main = "3D sactter plot" )

library(rgl)
plot3d(wt, disp, mpg)
plot3d(wt, disp, mpg , main = "wt vs mpg vs disp" , col ="red" , size = 10)

시각화중급

Boxplot
Scatterplot
Densityplot


box plot-데이터 분포도 알 수 있음 최소갓 최대값 중앙값->어디에 몰려있는지
abc <- c(110 , 300, 150, 280, 310)
def <- c(180, 200, 210, 190, 170)
ghi <- c(210, 150, 260, 210, 70)
boxplot(abc,def,ghi)

 

# col: 상자내부의색지정 
# names: 각막대의이름지정 
# range: 막대의끝에서수염까지의길이를지정 
# width: 박스의폭을지정 
# notch: TRUE이면상자의허리부분을가늘게표시 
# horizontal: TRUE이면상자를수평으로그림

 

5가지 요약 수치 사용

# Three small samples compared side by side as box plots.
abc <- c(110, 300, 150, 280, 310)
def <- c(180, 200, 210, 190, 170)
ghi <- c(210, 150, 260, 210, 70)
boxplot(abc, def, ghi)
# The group labels go in `names`. Arguments that follow `...` are not
# partially matched, so the original `name =` never reached that parameter.
boxplot(
  abc, def, ghi,
  col = c("yellow", "cyan", "green"),
  names = c("BaseBall", "SoccerBall", "BaseBall"),
  horizontal = TRUE
)
# Numeric summary (min, quartiles, median, mean, max) behind each box.
summary(abc)
summary(def)
summary(ghi)

head(iris)
ggplot(iris, aes(x= Sepal.Length, y = Sepal.Width))+geom_point()

ggplot(iris, aes(x= Sepal.Length, y = Sepal.Width))+geom_point(color="red",fill ="blue",shape = 21, alpha = 0.5, size= 6, stroke = 2)
#alpha투명도 
#stroke 안에 동그라미테두리

ggplot(iris, aes( x = Sepal.Length, y = Sepal.Width, color = Species,shape= Species))+geom_point(size = 6, alpha = 0.5)
ggplot(iris, aes( x = Sepal.Length, y = Sepal.Width, color = Species,shape= Species))+geom_point(size = 3, alpha = 0.5)

data = head(mtcars,30)
ggplot(data,aes(x= wt, y = mpg))+geom_point()+geom_text(label= rownames(data),nudge_x = 0.25, nudge_y = 0.25,check_overlap = T)
#check_overlap겹치느나 안겹치느내
#nudge_x 동그라미 와 오른쪽 거리 
#nudge_y 동그라미와 위거리 

ggplot(data, aes(x = wt, y = mpg)) +geom_label(label = rownames(data),nudge_x = 0.25, nudge_y = 0.2)
#텍스트 둘래 박스 쳐준다.

ggplot(data, aes(x = wt, y = mpg,fill= cyl)) +geom_label(label = rownames(data),color="white",size= 5)
#박스와 다르다.

ggplot(data= iris, aes(x = Sepal.Length, y = Sepal.Width))+geom_point()+geom_rug(col= "steelblue",alpha = 0.1 , size = 1.5)
#테두리 가에 있는 것
#농도가 진해지면 수치가 많다. 분포
library(ggplot2)
install.packages("ggExtra")
library(ggExtra)
head(mtcars)
mtcars
# Only `cyl` is categorical. `wt` and `mpg` stay numeric so they can serve as
# continuous axes and feed the marginal histograms/densities drawn below, and
# no global `mpg` object is created that would shadow the ggplot2 `mpg` data.
mtcars$cyl <- as.factor(mtcars$cyl)
str(mtcars)
ggplot(mtcars, aes(x = wt, y = mpg, color= cyl, size = cyl))+geom_point()+theme(legend.position = "none")
#legend.position = "none" 범례를 안보이게 하기 
ggplot(mtcars, aes(x = wt, y = mpg, color= cyl, size = cyl))+geom_point()
p <- ggplot(mtcars, aes(x = wt, y = mpg, color= cyl, size = cyl))+geom_point()+theme(legend.position = "none")
ggMarginal(p, type="histogram") #이력
ggMarginal(p, type="density") # 선
ggMarginal(p, type="boxplot") # boxplot
ggMarginal(p, type ="histogram", size = 10)#size 조정
ggMarginal(p, type = "histogram", fill="slateblue", xparams = list(bins= 10),yparams = list(bins = 10))

www.r-graph-gallery.com/

정지된것 은   plot
움직이는것 볼 수 있는 것은 창에서 

 

data = data.frame(cond = rep(c("condition_1","condition_2"),each= 10), my_x = 1:100 +rnorm(100, sd= 9),my_y = 1:100 +rnorm(100,sd= 16))
data
#rep(c("condition_1","condition_2"),each= 10) 10번씩
#표준편차 sd
#정교분포
ggplot(data,aes( x= my_x, y = my_y))+geom_point(shape= 1)

 

# Scatter plot with a fitted straight line (method = lm).
# `se` toggles the confidence band around the fitted line: the first call
# (se = F) draws the line only, the second (se = T) adds the band.
ggplot(data, aes(x= my_x, y = my_y))+geom_point(shape= 1) +geom_smooth(method = lm, color="red" ,se= F)
ggplot(data, aes(x= my_x, y = my_y))+geom_point(shape= 1) +geom_smooth(method = lm, color="red" ,se= T)

a = seq(1,29)+4 * runif(29,0.4)
#runif 0~0.1
b = seq(1,29) ^ 2 +runif(29, 0.98)
library(dplyr)
par(mfrow=c(2,2))#분할 로 해서 4개 그림 그린다.

plot(a,b, pch= 20)
plot(a-b, pch =18)
hist(a, border= F, col = rgb(0.2,0.2,0.8,0.7),main="")
#투명도 0.7
# 0.2 red 0.2 green 0.8 blue
boxplot(a, col ="grey", xlab="a")

install.packages("rattle")
library(rattle)
# `Temp3pm` is a column of `weatherAUS`, not a standalone object, so it is only
# referenced inside subset() and aes() below.
cities <- c("Canberra", "Darwin", "Melbourne", "Sydney")
# Keep the rows for the four cities that have a 3pm temperature recorded.
ds <- subset(weatherAUS, Location %in% cities & !is.na(Temp3pm))
# Overlaid, semi-transparent density curve of the 3pm temperature per city.
p <- ggplot(ds, aes(Temp3pm, colour = Location, fill = Location))
p <- p + geom_density(alpha = 0.55)
p
View(weatherAUS)
# %in%속해있는지 
#subset(weatherAUS,Location %in% cities & !is.na(Temp3pm)) 행과열을 추출하는 것이다.

subset(weatherAUS,Location %in% cities & !is.na(Temp3pm))

data(diamonds)
head(diamonds)

ggplot(data = diamonds , aes(x = price, group = cut, fill= cut))+geom_density(adjust = 1.5)
ggplot(data = diamonds , aes(x = price, group = cut, fill= cut))+geom_density(adjust = 5)
#가격에 대해서 예상 이런조건이면 

ggplot(data = diamonds, aes(x= price, group = cut, fill= cut))+ geom_density(adjust = 1.5, alpha= 0.2)
ggplot(data = diamonds, aes( x= price, group = cut, fill= cut))+ geom_density(adjust = 1.5, position = "fill")#누적되서 나타나는 것
x1 = rnorm(100)
x2 = rnorm(100, mean = 2)
par(mfrow = c(2,1))

par(mar = c(0,5,3,3))
plot(density(x1),main="",xlab = "", ylim = c(0,1),xaxt = "n", las = 1, col = "slateblue1", lwd = 4)
par(mar= c(5,5,0,3))
plot(density(x2), main ="", xlab ="Value of my variable", ylim=c(1,0), las = 1, col="tomato3", lwd = 4)


diamonds
ggplot(data = diamonds , aes(x = depth, group = cut, fill= cut))+geom_density(adjust = 1.5)


data <- data.frame(name = c("north","south","south-east","north-west","south-west","north-east","west","east"),val=sample(seq(1,10),8))
data
mpg

install.packages("forcats")
library(forcats)
library(dplyr)
data %>% mutate(name = fct_reorder(name,val)) %>% ggplot(aes(x=name, y = val))+
  geom_bar(stat= "identity")+
  coord_flip() #오름차순 

data %>% mutate(name = fct_reorder(name, desc(val))) %>% ggplot(aes(x= name, y = val))+
  geom_bar(stat= "identity")+
  coord_flip() #desc 내름차순 

data <- data.frame(name = letters[1:5], value= sample(seq(4,15),5), sd = c(1,0.2,3,2,4))
ggplot(data) + geom_bar(aes(x= name, y = value), stat ="identity", fill ="skyblue", alpha= 0.7)+
  geom_errorbar(aes(x = name,ymin = value-sd, ymax = value+sd),width = 0.4 , colour ="orange", alpha = 0.9, size = 1.3)

ggplot(data)+
  geom_bar(aes(x= name, y = value), stat ="identity", fill ="skyblue", alpha = 0.5)+
  geom_crossbar(aes(x = name, y = value, ymin = value-sd , ymax = value+sd ), width = 0.4 , colour="orange", alpha = 0.9, size = 1.3)



# Bars with a vertical line spanning value - sd to value + sd. geom_linerange()
# draws a plain line segment and has no `width` parameter, so none is passed.
ggplot(data) +
  geom_bar(aes(x = name, y = value), stat = "identity", fill = "skyblue", alpha = 0.5) +
  geom_linerange(aes(x = name, ymin = value - sd, ymax = value + sd), colour = "orange", alpha = 0.9, size = 1.3)

ggplot(data)+
  geom_bar(aes(x= name, y = value), stat ="identity", fill ="skyblue", alpha = 0.5)+
  geom_errorbar(aes(x = name, ymin = value-sd , ymax = value+sd ), width = 0.4 , colour="orange", alpha = 0.9, size = 1.3)+coord_flip()

 

반응형

'Study > R' 카테고리의 다른 글

R-6  (0) 2020.09.05
R-5  (0) 2020.09.05
R-4  (0) 2020.09.05
R -2  (0) 2020.09.05
R-1  (0) 2020.09.02
반응형

R studio에 속성 관리자권한으로 체크해야 권한이 없다고 안 뜬다.

 

dplyr 패키지를 이용한  데이터 전처리

dplyr 로 가공하기

 

airquality->자동완성

 

dplyr

 

obs 행-> 관측칙
variables 변수  독립변수
variables object

 

dim(airquality)
summary(airquality)
str(airquality)

#airquality를 이름바꾸기
air <- airquality
air

summary(air)
str(air)

a = 1
a

 

airquality 를 덥어썼으면 끄고 다시시작하기 기본으로 주는 것은 수정할 수 없다.메모리상에서 객체만 존재

 

#dplyr설치
install.packages("dplyr")
library(dplyr)

 

#dependency 필요한것도 같이 가져와서 설치

glimpse()#

str(air)
glimpse(air)#str q보다 직관적이게 보일수 있다.

 

# air에서 month하고 day로 
air1 <- air[,c(5,6)]
air1

air1 <- air[,c(1,3)]
air1

air1 <- air[,c('Ozone','Wind')]
air1

air1 <- air[,c('Ozone','Wind')]
head(air1)

tail(air1)

#1행부터 20행 까지 
air1 <- air[1:20,]
air1

air1 <- air[,1:4]
air1

air1 <- air[,c(-5,-6)]
air1

colnames(air1)# head이름  열
rownames(air1) #row 이름  행
names(air1)#열이 더 중요하다. 

엑셀보다   csv읽는 이유는 크기가 작아서 
rep->복제 

 

x1 <- 1:20
x2 <- rep(c("a","b"),10)
x2
x3 <- sample(1:200,20) #random 데이터 
x3

 

# 1-50  random 10개
x1 <- sample(1:50,10) #random 데이터 
x1
# 1은 안쓰도  default로 되여있다.
x1 <- sample(50,10) #random 데이터 
x1
# 30-50  random 10개
x1 <- sample(30:50,10) #random 데이터 
x1
# 45-50  random 10개
x1 <- sample(45:50,10,replace = TRUE) #'replace = FALSE' 일때는 모집단보다 큰 샘플을 가질 수 없습니다
x1
# 45-50  random 10개
x1 <- sample(45:50,10,replace = TRUE) #random 데이터 뽑는 것 또 뽑는다. 복원추측

 

x1 <- sample(45:50,10,replace = TRUE) #random 데이터 뽑는 것 또 뽑는다. 복원추측 
x1

set.seed(1234)

x1 <- sample(45:50,10,replace = TRUE) #set.seed하고 조회

x1

 

#airquality에서 153개인데 random으로 15개 끄내기
# Draw 15 random rows out of the 153 rows of airquality.
air1 <- airquality
index <- sample(153, 15)
index
air1 <- air1[index, ]
air1

 

air1 <- nrow(airquality)
air1
air1 <- ncol(airquality)
air1

 

index <- sample(nrow(airquality),15)
index

 

vector한개  set안에 여러개 연다.

air1 <- airquality
air1
a <- sample(nrow(air1), 15)
a[3] # third sampled row number
dim(air1)[1] # number of rows, same as nrow(air1)

# In RStudio, Alt + - inserts the assignment operator `<-`.

# Sample 70% of the row numbers for a train/test split. floor() makes the
# sample size an explicit whole number (153 * 0.7 = 107.1).
index1 <- sample(nrow(air1), floor(nrow(air1) * 0.7))
index1
# Split on `index1`; the rows not drawn for training form the test set.
train <- air1[index1, ]
test <- air1[-index1, ]

 

 

ls()
rm(air)
# Only `a` was created above; R is case-sensitive and no object `A` exists.
rm(a)
ls()
rm(list = ls()) # remove every object in the workspace

 

R
help(sample)
prob  a vector of probability weights for obtaining the elements of the vector being sampled. 비율
?sample help와 가능이 같다.

 

RStudio f1

head

 

dplyr 패키지를 이용한  데이터 전처리 \
filter(  ) 행 추출 
select(  ) 열(변수) 추출 
arrange(  ) 정렬 
mutate(  ) 변수 추가 
summarise(  ) 통계치 산출 
group_by(  ) 집단별로 나누기 
left_join(  ) 데이터 합치기(열) 
bind_rows(  ) 데이터 합치기(행) 

 

filter 조건 class가 열 
%>% -> ctrl+shift+m %>% 파이프 연산자  Ctrl + Shift + M

 

library(dplyr)
exam <- read.csv("csv_exam.csv")
exam

exam %>% filter(class == 1)# class가 1인것 
exam %>% filter(class != 1)
exam %>% filter(math > 50)#수학점수가 50보다 크다.
exam %>% filter(english >= 80)
exam %>% filter(class == 1 & math >= 50)

exam %>% filter(class == 1 | english >= 90)

exam %>% filter(class == 1 | class == 3 | class == 5)
exam %>% filter(class %in% c(1,3,5))

class1 <- exam %>% filter(class==1)
mean(class1$math)
air <- airquality
air
air %>% filter(Day>20)#20보다 큰 달을 구한다.
air %>% filter(Day>20) %>% filter(Month == 9)
# 1,3 반중에서 80명 이상되는 분 
exam %>% filter( (class == 1 | class == 3) & english >= 80 ) 

#열 추출
exam %>% select(math)
exam$math

exam %>% select(class,math,english)
select(exam, class)#열의 이름을 가져온다.위에것과 같은 원리이다.

exam %>% select(-math)

가독성을 위해서,
%>%(파이프 연산자)에서 줄을 바꾼다.
Enter를 치면 알아서 들여쓰기가 된다.

 

# class가 1인 english 열만 
exam %>% filter(class == 1) %>% select(english)
exam %>% select(english) %>% filter(class == 1)
#atomic과 리스트 타입들에 대해서만 비교(1)가 가능합니다
#같아 보이는데 조금 다르다.

가독성을 위해서,  %>%(파이프 연산자)에서 줄을 바꾼다. 
 
Enter를 치면 알아서 들여쓰기가 된다. 

dplyr 로 가공하기 

 

exam %>% arrange(math)#order by 오름차순
exam %>% arrange(desc(math))#내림 차순

exam %>% arrange(class,math) # 1순위 class 2순위 math


4. 파생변수 추가하기 & 집단별로 요약하기 
4. 파생변수 추가하기 ->앞의 것에서 열의 의하여 새로운 열을 만든다.
mutate->있는데서 변형하는 것이다.
exam %>% mutate(total = math+english+science) %>% 
  head

exam

 

exam %>% mutate(total = math+english+science,
                mean = (math+english+science)/3) %>% 
        head

# Label each student by science score. ifelse() takes the condition and both
# outcomes as its three arguments, all inside the same call.
exam %>%
  mutate(test = ifelse(science >= 60, "pass", "fail")) %>%
  head()

exam %>% mutate(total = math+ english+science) %>% 
        arrange(total) %>% 
        head

 

요약 통계량 함수

mean(  ) 평균 ->r전체에서
sd(  ) 표준편차 ->r전체에서
sum(  ) 합계 ->r전체에서
median(  ) 중앙값 ->r전체에서
min(  ) 최솟값 ->r전체에서
max(  ) 최댓값 ->r전체에서 
n(  ) 빈도 ->summarize만 같이 있을때만 작동한다.

 

summarise( ): 

summarise() is typically used on grouped data created by group_by().

The output will have one row for each group

summarise(data.frame, functions...) 

수치형 값에 대한 "요약" 통계량을 계산하여 출력한다. 
Center: mean(), median() 
 Spread: sd(), IQR(), mad() 
Range: min(), max(), quantile() 
Position: first(), last(), nth(), 

 

exam %>% summarise(mean_math = mean(math))# 수학평균이 얼마인가

exam %>% 
   group_by(class) %>% 
   summarise(mean_math = mean(math))

exam %>% 
  group_by(class) %>% 
  summarise(sd_math = sd(math))

#class별로 평균하였을 경우 
exam %>% 
  group_by(class) %>% 
  summarise(mean_math = mean(math),
            sum_math = sum(math),
            median_math = median(math),
            n = n()) #학생수 

# Average of city and highway mileage for SUVs, per manufacturer, top five.
# The city-mileage column of ggplot2's mpg data is `cty`; `ggplot2::mpg` is
# spelled out so an attached or reassigned `mpg` cannot shadow the dataset.
ggplot2::mpg %>%
  filter(class == "suv") %>%
  group_by(manufacturer) %>%
  mutate(tot = (cty + hwy) / 2) %>%
  summarise(mean_tot = mean(tot)) %>%
  arrange(desc(mean_tot)) %>%
  head(5)

 

R을 활용한 Data Visualizaition

2일차 데이터 시각화 / 전처리

데이터 시각화 / 전처리 
1. 데이터 시각화의 중요
2.  기본 그래픽- 고수준, 저수준 
R 그래픽 도구 
1.  R 기본 그래픽  (R Base Graphics) 
2.  Lattice Graphics 
3.  ggplot2 
Easy    Fast   Beautiful 
1.  R 기본 그래픽  (R Base Graphics) 
내장되여있어서 설치 필요없음
막대그래프, 히스토그램, 파이그래프 등 여러 시각화 방법을 제공 

별도의 설치 및 호출 필요 없고 가벼움     
설정이 다소 복잡하고 아름답지 못하다는 단점 

 

2.  lattice 
한꺼번에 많은 플롯을 생성할 수 있다.  
다차원의 데이터를 사용하여 변수들 간의 관계를 살펴보는 데 유리
순차적으로 그래프 쌓아가는 것이 어려워 직관적이지 못하다 

 

3.  ggplot2 
이전 두 패키지(R Base Graphics, Lattice Graphics)의 장점만 모아 둔 패키지 
 
간단한 그래프 문법 + 아름다운 고급그래프 + 레이어로 쌓아감
 
데이터객체, 그래픽객체로 나눌 수 있어 코드의 재사용성이 높다. 

 

Anscombe’s  quartet 

이산변수 점수 단위로 나누어 측정할 수 있는 변수 막대차트,점그패프,원 차트 등을 이용하여 시각화하면 효과적 
연속변수 시간,길이 등 

고수준 그래픽 함수(high level graphic functions) 
 

plot 함수 
데이트를 x-y평면 상에 출력하는 함수

plot(x, y, type = ‘type value’, main=‘title’, col=color) 
# type : plot의 형태로 점, 선 등을 선택할 수 있다. 
# main: 그래프의 제목 설정 
# col : 그래프의 색상 
Type 옵션 
p : 점(points),  l : 선(lines),  b : 점과 선(both points and lines),  c : b옵션에서 점이 빠진 모습,
o : 겹친 점과 선(overplotted),  h : 수직선 ,  s : 수평선 우선의 계단 모양 (steps),
S : 수직선 우선의 계단 모양 (steps),  n : 배경만 그리고 출력하지는 않음 (no plotting)

 

mtcars#r자체에 내장되여있다.
?mtcars
str(mtcars)
names(mtcars)

plot(mtcars)
attach(mtcars)

wt
plot(wt)
mpg
plot(wt, mpg)
plot(wt, mpg, main = "wt와 mpg의 관계")
plot(wt, disp, mpg)#

install.packages("scatterplot3d")
library(scatterplot3d) #package ‘scatterplot3’ is not available (for R version 3.6.1)

scatterplot3d(wt, disp, mpg, pch = 16, main = "3D Scatter Plot")
# `highlight.3d` colours the points according to their y coordinate; with
# type = "h" each point is drawn with a vertical line down to the base plane.
scatterplot3d(wt, disp, mpg, pch = 16, highlight.3d = TRUE, type = "h", main = "3D Scatter Plot")

install.packages("rgl")
library(rgl)
plot3d(wt, disp, mpg)
# `size` is passed as a number rather than the string "10".
plot3d(wt, disp, mpg, main = "wt Vs mpg Vs disp", col = "red", size = 10)

고수준 그래픽 함수(high level graphic functions)

plot() 산점도 출력
barplot() 막대 차트 출력
pie() 파이 차트 출력
matplot() 다중 산점도 출력

 

x11() 

par(mfrow= c(2,3))#multifrow 6개 분할 

plot(0:6,0:6, main ="default")
plot(0:6,0:6, type="b" , main="type = \"b\"")
plot(0:6,0:6, type ="c", main="type =  \"c\"")
plot(0:6,0:6, type ="o", main="type = \"o\"")
plot(0:6,0:6, type="s", main="type= \"s\"")
plot(0:6,0:6, type="S", main="type= \"S\"")

범주형 데이터의 수준별
사용법 barplot(H, width = 1, beside = FALSE, main=‘title’, col=NULL, horiz= …) 

# H (height): 벡터나 행렬 입력 가능 (당연히 numeric)
# beside 인수 : 옆으로 나란히. FALSE 는 누적 
# col : 그래프의 색상 
# horiz= 막대를 평행하게

par(mfrow = c(1, 1))
x <- c(38, 52, 24, 8, 3)
barplot(x) # bar chart

par(mfrow=c(1,1))
x <- c(38,52,24,8,3)
barplot(x)

names(x) <- c("Excellent","Very Good","Good", "Fair","Poor")
barplot(x)

# The values were originally typed at the interactive scan() prompt (the
# "1:" / "16:" prefixes were console prompts, not data). They are written out
# as a vector so the script also runs non-interactively.
y <- c(
  1, 2, 3, 3, 4, 3, 4, 1, 5, 3, 3, 3, 2, 4, 4,
  2, 4, 3, 5, 3, 1, 2, 3, 3, 4, 4, 3, 2, 3, 4
)
barplot(y)

par(mar=c(2,4,2,2))#여백주는 함수
barplot(table(y), xlab= "Beverage", ylab ="Frequency")
barplot(table(y)/length(y), xlab ="Beverage", ylab = "proportion")
table(y) : 데이터의 도수를 표현 length(y) : 데이터의 갯수(길이) 

객체만들기
sales <- c( 45, 44, 46)
names(sales) <- c("Park", "Kim" ,"Lee")
barplot(sales, main="Sales", ylab ="Thousands")
sales: 데이터 객체 names(sales) : 데이터의 이름 설 정 

범위 조절하기
barplot(sales, main="Sales", ylab ="Thousands" , ylim=c(42,46), xpd=FALSE)
ylim = c(42,46) : y축 범위 설정, xpd = FALSE : 막대가 플롯 영역을 벗어나는 것의 허용 여부

pie차트
x
pie(x)

 

names(x) <- c("Excellent","Very Good", "Good","Fair","Poor")
barplot(x)
barplot(x, xlab="수준",ylab="점수")
barplot(x, xlab="수준",ylab="점수", col="blue")
barplot(x, xlab="수준",ylab="점수", col="blue", horiz=TRUE)#범위가 40에서 나갔다.
barplot( x, xlab="수준" , ylab = "점수" , col=c("blue","light blue","red","yellow","grey"), horiz=TRUE)#범위

pie함수
데이터를 파이 차트(원 그래프)로 출력하는 함수

사용법 pie(x, labels = names(x), radius = 0.8, clockwise = FALSE, init.angle = if(clockwise) 90 else 0, density = NULL, angle = 45, col = NULL, …) 

# x : 음수나 0이 아닌 숫자형 벡터  

# labels : 기본값으로 x 벡터의 이름이 사용, 새롭게 지정 가능 

# radius : 파이의 반지름 
# init.angle : 파이 차트가 시작되는 각도(clockwise가 TRUE면 90도 아니면 0도 ) 
# density : 파이 내부의 빗금을 표시하는 밀도 
# angle : 파이 내부의 빗금으 표시하는 기울기 
# col : 파이 내부의 색상 

score <- read.table("score.txt",header= T ,fileEncoding = "UTF-8")
score

score$"성명"

score$"국어"

paste(score$"성명","-",score$"국어")

# One slice per row of `score`; slice labels read "<name> - <score>" and are
# passed through the `labels` argument of pie().
pie(
  score$"국어",
  labels = paste(score$"성명", "-", score$"국어"),
  col = rainbow(10),
  clockwise = TRUE
)

# Same chart with the score in parentheses on a second label line.
pie(
  score$"국어",
  labels = paste(score$"성명", "\n", "(", score$"국어", ")"),
  col = rainbow(10),
  clockwise = TRUE
)

install.packages("googleVis")
library(googleVis)

# Build a bracketed, single-quoted, comma-separated list of `color_count`
# rainbow colours as one string, e.g. "['#FF0000','#00FFFF']". The result is
# passed as the `colors` chart option further down in this script.
buildcolors <- function(color_count) {
  # Keep only the first seven characters ("#RRGGBB") of each colour string.
  hex_codes <- substring(rainbow(color_count), 1, 7)
  quoted_list <- paste0("'", paste(hex_codes, collapse = "','"), "'")
  paste0("[", quoted_list, "]")
}

cols <- buildcolors(10)

pie <- gvisPieChart(data.frame(score$'성명',score$'국어'),option = list(width = 600, height = 600, title='국어성적',colors=cols,pieSliceText ="label",pieHole="0.5"),chartid="donut")
header <- pie$html$header
header <- gsub("charset=utf-8","charset=euc-kr",header)
pie$html$header <- header
plot(pie)

pie <- gvisPieChart(data.frame(score$"성명",score$"국어"),option = list(width = 600, height = 600, title="국어성적",colors=cols,pieSliceText ="value",pieHole="0.5"),chartid="donut")
header <- pie$html$header
header <- gsub("charset=utf-8","charset=euc-kr",header)
pie$html$header <- header
plot(pie)

paste() : 문자열을 붙이는 함수
clockwise = TRUE : 파이 차트 시작각도 를 90도로 설정

 

\n : 줄바꿈

 

저수준 그래픽 함수(low level graphic functions) 
points() 지정한 좌표에 점을 찍는 함수
abline() y=a+bx 의 직선을 그리는 함수 
legend() 범례를 출력하는 함수 
text() 
Plot 영역의 (x,y) 좌표에 문자 를 출력하는 함수 

 

Ex) abline(a, b, lty, col, other options) # y = a + bx abline(h = a, lty, col, other options) # y = a abline(v = b, lty, col, other options) # x = b abline(lm object) # 회귀 직선 
 
# lty : line type으로 1-solid line, 2-dashed line 등 # col : 직선의 색상 

 

cars
cars[1:4,]

#값을 예측하기 위해서 하는 것 
z <- lm( dist ~ speed, data = cars)
summary(z)

x11()
par(mfrow= c(1,1))
plot(cars,main ="abline")


abline(h = 20)
abline(h = 30)

abline(v = 20, col ="blue")

abline(a = 40, b = 4, col='red')

abline(z, lty= 2, lwd= 2, col= 'green')

abline(z$coef, lty= 3, lwd = 2, col='red')

legend()함수

legend(x, y, legend, pch, lty, fill, col, …) x, y : legend를 출력할 위치 지정  ex) x=a, y=b : 좌표 (a,b)에 범례를 출력. 위치를 나타내는 문자도 사용 가능 ex) "topright", "bottomleft", "center" 등
 
pch : 점에 대한 범례일 경우, 점을 구분하기 위해 사용 

lty : 선에 대한 범례일 경우, 선의 type을 구분하기 위해 사용 

fill : 면에 대한 범례일 경우, 면의 색상을 구분하기 위해 사용 

pch와 lty 동시 사용 : 점과 선을 동시에 사용한 그래프의 범례 

x11()
plot(1:10, type="n", xlab="",ylab="",main="legend")

legend("bottomright","(x,y)",pch=1,title="bottomright")
legend("bottom","(x,y)",pch=1,title="bottom")
legend("bottomleft","(x,y)",pch=1,title="bottomleft")
legend("left","(x,y)",pch=1,title="left")
legend("topleft","(x,y)",pch=1,title="topleft")
legend("top","(x,y)",pch=1,title="top")
legend("topright","(x,y)",pch=1,title="topright")
legend("right","(x,y)",pch=1,title="right")
legend("center","(x,y)",pch=1,title="center")
legends <- c("Legend1","Legend2")

legend(3,8, legend= legends,pch = 1:2, col = 1:2)
legend(7,8, legend= legends,pch = 1:2, col = 1:2,lty= 1:2)
legend(3,4, legend= legends,fill = 1:2)
legend(7,4, legend= legends,fill = 1:2, density = 30)

연습문제:

x <- c(1,1,1,2,2,2,2,2,2,3,3,4,5,6)
y <- c(2,1,4,2,3,2,2,2,2,2,1,1,1,1)
zz <- data.frame(x,y)
zz
sunflowerplot(zz)
plot(zz)

data(mtcars)
stars(mtcars[,1:4])
stars(mtcars[1:4], flip.labels= FALSE, key.loc = c(13,1.5))

stars(mtcars[1:4], key.loc = c(13,1.5), draw.segments = TRUE)

# Bubble chart: circles centred at (xx, yy) whose radii come from zz.
xx <- c(1, 2, 3, 4, 5)
yy <- c(2, 3, 4, 5, 6)
zz <- c(10, 5, 100, 20, 10)
symbols(xx, yy, circles = zz)

# Scatter-plot matrix of the three variables, one matrix column each. The
# matrix gets its own name so the base function c() is not shadowed.
xx <- c(1, 2, 3, 4, 5)
yy <- c(20, 13, 40, 50, 60)
zz <- c(10, 5, 100, 20, 10)
xyz_matrix <- matrix(c(xx, yy, zz), nrow = 5, ncol = 3)
xyz_matrix
pairs(xyz_matrix) # handy when there are many numeric columns to compare

 

persp()#3차원함수

contour()#3차원함수
filled.contour(volcano,color.palette = terrain.colors,asp=1)
title(main="volcano data: filled contour map")

plot(0:6,0:6, type="n", main="type= \"n\"")
plot(0:6,0:6, type="b", lty="dashed")

x <- runif(100)
y <- runif(100)
# Choose the point character from y: two classes split at 0.5.
plot(x, y, pch = ifelse(y > 0.5, 1, 18))

# Three classes split at 0.4 and 0.6.
plot(x, y, pch = ifelse(y > 0.6, 15, ifelse(y > 0.4, 5, 14)))

# Three classes split at 0.5 and 0.7. The last branch is a plain fallback
# value; the original innermost ifelse() had no `no` argument and would fail
# for any y below 0.
pch_by_level <- ifelse(y >= 0.7, 8, ifelse(y >= 0.5, 5, 12))
plot(x, y, pch = pch_by_level)
#kbo.csv ㅇ읽어서 
kbo <- read.csv("kbo.csv")
kbo
#6행보여주기
head(kbo)

#팀별로 정렬하되 알파벳 내림차순 6행까지 출력
kbo %>% group_by(팀) %>% arrange(desc(팀)) %>% head() 

arrange(kbo,desc(팀)) %>% head()

#2017년도의 것만 추출하며 첫 6행 까지 출력하세요 
filter(kbo,연도 == 2017) %>% head()

kbo

#안타,2루타 ,3루타,홈런만 추출하여 첫 6행 까지 출력하세요
select(kbo,안타,X2루타,X3루타,홈런) %>% head()

# Hits, doubles, triples and home runs for the 2017 season, first 5 rows.
filter(kbo, 연도 == 2017) %>%
  select(안타, X2루타, X3루타, 홈런) %>%
  head(5)

kbo

#데이터를 타율(안타/타수)이라는 변수를 넣고 첫 6행 까지 출력하세요 
kbo %>% mutate(타율 = 안타/타수) %>% head()
mutate(kbo, 타율 = 안타/ 타수) %>%  head()
반응형

'Study > R' 카테고리의 다른 글

R-6  (0) 2020.09.05
R-5  (0) 2020.09.05
R-4  (0) 2020.09.05
R-3  (0) 2020.09.05
R-1  (0) 2020.09.02

+ Recent posts