R-4 :: NAIAHD

R-4

2020. 9. 5. 13:59

728x90

library(dplyr)
library(ggplot2)
mpg
data1 <- mpg %>% filter(displ <= 4) %>% summarise(mean(hwy))
data1
data2 <- mpg %>% filter(displ >= 5) %>% summarise(mean(hwy))
data2 

data <- mpg %>% filter( displ <= 4 | displ >= 5) %>% mutate(group1 = ifelse(displ <= 4, 'a', ifelse(displ>=5,'b','c'))) %>% group_by(group1) %>% summarise(mean(hwy))
data

mpg %>% filter()

#mpg는 위에  10개만 보여준다.
head(airquality, 20)
head(mpg, 30) #강제로 10개만 보여준다.

#데이터 타입으로 바꾸는것 as.data.frame
airquality <- as_tibble(airquality)
airquality

mpg

mpg %>% filter(manufacturer=='audi' | manufacturer=='ford') %>% group_by(manufacturer) %>% summarise(test=mean(cty)) %>% arrange(desc(test)) 

mpg_new <-mpg %>% mutate(total = (hwy+cty)/2) %>% arrange(desc(total)) %>% head()        
d2 <-mpg %>% mutate(total = (hwy+cty)/2) %>% arrange(desc(total))  %>% head()  
d2 <- as.data.frame(d2)
arrange(d2,desc(total))

mpg_new <- mpg %>% select('class','cty')
mpg_new

mpg_new %>%  filter(class =='suv' | class =='compact') %>% group_by(class) %>% summarise(sum(cty)) 

mpg %>% select('class','cty') %>%  filter(class =='suv' | class =='compact') %>% group_by(class) %>% summarise(mean(cty))  
mpg %>% filter(class =='suv' | class =='compact')  %>% select('class','cty')  %>% group_by(class) %>% summarise(mean(cty)) 

mpg %>% filter(class =='compact') %>% group_by(manufacturer) %>% summarise(tot=n()) %>% arrange(desc(tot))

mpg데이터셋 보기
ggplot

str->str(mpg)

tbl_df , tbl and data.frame
tbl data.frame비슷하는데 계산하기 위해서 나오는 것
tbl_df
chr m num, int등으로 되여있다.
열의 이름을 보고 싶으면 names(mpg)
names(mpg)[8] #cty만 뜨게끔 몇번째 명을 지정한다.
str(mpg)
names(mpg)
names(mpg)[8] #cty만 뜨게끔 몇번째 명을 지정한다.

names(mpg)[8] <- 'city'
names(mpg)
mpg
names(mpg)[8] <- 'cty'
names(mpg)
mpg

dplyr :: glimpse(mpg)
mpg
names(mpg)[12]
mpg %>% select(c(names(mpg)[12] ))

mpg <- subset( mpg, select = -c(names(mpg)[12] ) )
mpg

mpg <- ggplot2::mpg
mpg
ggplot(data= mpg,aes(x= displ,y=hwy))+geom_point()

ggplot(mpg,aes(displ,hwy,colour = class))+geom_point()

ggplot(mpg, aes(displ,hwy))+geom_point((aes(colour ="blue")))# 바닥에 색갈이 었으면 두개 층이 겹쳐져서 원하는 색갈을 못가진다.레이어가 겁쳐서나오기  overwritting
ggplot(mpg, aes(displ,hwy))+geom_point(colour ="blue")#동급에서 색갈이 먹여지는데 

ggplot(mpg, aes(displ,hwy,colour = class))+geom_point() #색갈갈
ggplot(mpg, aes(displ,hwy,colour = trans))+geom_point()#
ggplot(mpg, aes(displ,hwy,colour = drv))+geom_point()#
ggplot(mpg, aes(displ,hwy,colour = cty))+geom_point()#

#shape도형형
ggplot(mpg, aes(displ,cty, shape=drv))+geom_point()
ggplot(mpg, aes(displ,cty, shape=class))+geom_point()
ggplot(mpg, aes(displ,cty, shape=trans))+geom_point()
ggplot(mpg, aes(displ,cty, shape=cty))+geom_point()#A continuous variable can not be mapped to shape

#size #많을 수록 많아진다.
ggplot(mpg, aes(displ, cty, size= cty)) +geom_point()
ggplot(mpg, aes(displ, cty, size= trans)) +geom_point()

ggplot(mpg, aes(displ, cty, size= cty)) +geom_point(colour ="red")
ggplot(mpg, aes(displ, cty, size= cty)) +geom_point(colour= cty)#객체 'cty'를 찾을 수 없습니다
ggplot(mpg, aes(displ, cty, size= cty)) +geom_point(aes(colour= cty))

#만약 size와 color를 다르게 주면 어떤 그림을 그려 낼까요 
ggplot(mpg, aes(displ ,cty, size = cty , color= drv))+geom_point()

ggplot(mpg,aes(cty,hwy))+geom_point()#점선


str(diamonds)
ggplot(diamonds,aes(carat,price))+geom_point()
ggplot(economics, aes(date, unemploy))+geom_line()#선으로 
ggplot(mpg,aes(cty))
ggplot(mpg,aes(cty))+geom_histogram()#1차원으로 하는 것것


ggplot(mpg,aes(cty))+geom_histogram(bins=10)#더 굵어진다.
ggplot(mpg,aes(cty))+geom_histogram(bins=20)#bins는 막대기 그림림

dia <- diamonds #이름 바꾸기 
class(dia)
dia
#ord order 순서가 정해진다.
#carat 크기 모양

#범주유형으로 색상 하기 
ggplot(diamonds,aes(carat,price, color=cut))+geom_point()
ggplot(diamonds,aes(carat,price, color=color))+geom_point()
ggplot(diamonds,aes(carat,price, color=clarity))+geom_point()

#r과 rstudio 속도 관련문제
r은 훨씬 빠르고 부드럽게 된다.

#facetting
#따로따로 보여주는 것
#범주용 데이터에 대하여

ggplot(mpg, aes(displ,hwy))+geom_point()+facet_wrap(~class)

geom_smooth()
ggplot(mpg,aes(displ,hwy))+geom_point()+geom_smooth()#곡선 범위 
ggplot(mpg,aes(displ,hwy))+geom_point()+geom_smooth(method="loess")#곡선 defaule  local지역을 쪼개서 연결하는 상황
ggplot(mpg,aes(displ,hwy))+geom_point()+geom_smooth(method="lm")#직선 범위  lw linear model 선형모델 

geom_boxplot()

#gitter 흩어주는 것인데 geom_violin()더 예쁘지는 것이다. 절반짤라서 
#geom_violin
ggplot(mpg, aes(drv, hwy)) +geom_violin()
ggplot(mpg,aes(drv,hwy))+geom_jitter()
ggplot(data = mpg, aes(x = drv, y = hwy))+
  geom_point(size = 2, position = "jitter")

geom_feqploy()
ggplot(mpg, aes(hwy)) +geom_freqpoly()#histogram그리고 그다음 그리는 것 
ggplot(mpg, aes(hwy)) +geom_freqpoly(bins = 20)#범위가 넓어진다.

geom_histogam()
ggplot(mpg, aes(displ,color= drv)) +geom_histogram(bindwidth= 0.5)#색상이 안된다.
ggplot(mpg, aes(displ,fill= drv)) +geom_histogram(bindwidth= 0.5)#
ggplot(mpg, aes(displ,fill= drv)) +geom_histogram(bindwidth= 0.5,position = "dodge")#한줄에 색사이 하나밖에 없다.


#geom_bar
#자주사용하는 것이다.
ggplot(mpg, aes(displ,fill= drv)) +geom_bar(position = "dodge")
ggplot(mpg, aes(displ,fill= drv)) +geom_bar(position = "fill")

ggplot(mpg, aes(manufacturer))+geom_bar()#변수가 하나일때는 stat안하면 자동으로count로 된다.
drugs <- data.frame(drug = c("a","b","c"), effect = c( 4, 9, 6))
ggplot(drugs, aes(drug, effect))+geom_bar(stat = "identity")#y값이 정해져있다.identity로무조건 설정해야 한다.
ggplot(drugs, aes(drug, effect))+geom_bar()#stat_count() must not be used with a y aesthetic.

ggplot(economics, aes(date, unemploy / pop))+geom_line()#알아서 계싼해서 만들어준다.
ggplot(economics, aes(date, unemploy))+geom_line()

ggplot(mpg, aes(drv, hwy)) +geom_boxplot()#박스가 25%에서 선을 거고 50% 선을 거 위에 25%에 선이 있다. 중앙값 
#잴 위에 있는 것은 이상값 동그라미 


mpg %>% filter(hwy < 20 & drv == 'f') 
mpg %>% filter(hwy < 25 & drv == 'f') %>% arrange(hwy)

데이터 프레임 합치기
결측치 not avaliable
데이터 정제하기[결측치] 데이터 비여있기

#na이냐 아니냐 
df <- data.frame(sex= c("M","F",NA,"M","F"),score=c(5,4,3,4,NA))
df
is.na(df)
table(is.na(df))

table(is.na((df$sex)))
table(is.na(df$score))

#na빼고 계산하면 안되기때문에 na로 되여있다.
mean(df$score)#[1] NA ->수학계산을 알수없다.
sum(df$score)#[1] NA->수학계산을 알수없다.

df %>% filter(is.na(score))#na인것만 골라낸다.
df %>% filter(!is.na(score))#na가 아닌것을 골라낸다.

#해결법은 아닌것만 모아서 계산하겠다.
df_nomiss <- df %>% filter(!is.na(score)) 
df_nomiss
#평균은 모두합쳐서 ,na빼고 , error
mean(df_nomiss$score) 
sum(df_nomiss$score)

df_nomiss <- df %>% filter(!is.na(score) & !is.na(sex))
df_nomiss

#na.omit생략한다. 지운다. na가보이면 그 행을 지운다. 
#na가 모이면 그 행을 삭제한다. 그래서 사용하면 안된다. 위험한 행위이다.
#다만 데이터가 엄청 많을때 무슨 영향을 미치는 지 알고 있을때 가능하다.
#위험한 행위이다.
df_nomiss2 <- na.omit(df)#모든 변수에 결측치 없는 데이터 추출
df_nomiss2

#아래것은 가장 권장한 결과이다.
#na가  있다는 것을 알고 있기에 원래 데이터가 수정되는 것이 아니여서 안정성이 있다.
mean(df$score, na.rm =  T)#결측치 제외하고 평균산출 
sum(df$score, na.rm = T)#결측치 제외하고 합계 산출
#결론은 어떻게 하냐 ? 입력 is.na na.rm na.omit
#결측치를 지우고 한것이다.

데이터 정제하기 [이상치] 이상한 이상이다.
이상치의 가장대표적인 예는 로또 1등이다. 분포에 끝에 있다는 것이지 나쁘는 것은 아니다.

outlier <- data.frame(sex= c(1,2,1,3,2,1) , score= c(5,4,3,4,2,26))
outlier

table(outlier$sex)
table(outlier$score)

outlier$sex <- ifelse(outlier$sex == 3, NA, outlier$sex)#3은 이상치보다 오류이다.
outlier

outlier$score <- ifelse(outlier$score>5 , NA, outlier$score)#16
outlier

outlier <- data.frame(sex= c(1,2,1,3,2,1) , score= c(5,4,3,4,2,26))
boxplot(outlier$score)

outlier %>% filter(!is.na(sex) & !is.na(score)) %>% 
  group_by(sex) %>% 
  summarise(mean_score = mean(score))

정규표현식
함수
인수가 없는 함수

#함수 ->변수선언할 필요없다.
#함수 ->

Minho <- function(){
  x <- 10
  y <- 20
  return (x*y)#돌려주는 값의 선언
}

ls()
Minho
Minho()

#인수가 있는 함수의 선언
Minho2 <- function(x,y){
  xx <- x
  yy <- y
  return (sum(xx,yy))#돌려주는 값의 선언
  #시스템이 정의한 특정 함수를 이용한 결과를 돌려줌
}

Minho2(2,3)

kaggle 
gitub

Minho3 <- function(x,y){
  x3 <- x+1
  y3 <- y+1
  x4 <- Minho2(x3, y3)#함수에서 함수를 부르는 경우 재귀호출이 가능하다.
  return(x4)
}
Minho3(2,4)

#결과를 화면에 반환하지 않고 변수에 할당
Minho4 <- function(){
  x <-  10
  y <-  10
  return(invisible(x*y)) # 결과값은 보여지지 않지만 변수에는 값이 들어간다.
}
Minho4()
result <- Minho4()
result

#함수 외부의 변수를 조작해야
rm(x)
x <- 70 #시스템 변수 x에 70을할당 
ls()
minho5 <- function(){
  x <- 10#함수내에서 사용하는 변수 
  y <- 20#함수내에서 사용하는 변수 
  
  x <<- 40 #시스템에서 사용하는 변수  x에 40을 할당
  return(x+y)
}
minho5()

minho5 <- function(){
  x <- 20#함수내에서 사용하는 변수 
  y <- 20#함수내에서 사용하는 변수 
  x+y
}

#return이 우선순위고 return이 없으면 마지막으로 된다.
minho5 <- function(){
  x <- 20#함수내에서 사용하는 변수 
  y <- 20#함수내에서 사용하는 변수 
  return(x+y)
  x-y
}

minho5()
minho5

sum1 <- 0 #변수를 설정
for(i in seq(1,10,by =1 )) sum1 <- sum1+i#1에서 10을 1단위로 차례로 넣고 결과 확인
sum1

sum1 <- 0
for(i in 1:5){
  for(j in 1:5){
    sum1 <- sum1 + i*j
  }
}
sum1

sum1 <- 0
for(i in 1:5){
  for(j in 1:5){
    sum1 <- sum1 + i*j
  }
}
sum1

sum1 <- 0 #변수를 설정
for(i in seq(1,10,by =1 )) sum1 <- sum1+i#1에서 10을 1단위로 차례로 넣고 결과 확인
sum1

sum1 <- 0
for(i in 1:5){
  for(j in 1:5){
    sum1 <- sum1 + i*j
  }
}
sum1

while문
sum2 <- 0
i <- 1
while(i <= 10){
  sum2 <- sum2+i;
  i <-i+1
}
sum2

#repeat
sum3 <- 0
i <- 1
repeat{
  sum3 <- sum3+i
  i <- i+1
  if(i>10) break#이 조건에 맞으면 탈출한다.
}
sum3 <- 0
i <- 0
repeat{
  if(i> 5) break
  j <- 0
  repeat{
    if(j > 5)break
    sum3 <- sum3+i*j
    j <- j+1
  }
  i <- i+1
}
sum3

sum1 <- 0
len <- 10
for(i in 1:len){
  sum1 <- sum1 + i
}
sum1

서포트 벡터 머신
데이터 성격 이해 ->탐색적 데이터 (ggplot2,baseR graphic,플롯 , 페어)->변수를 피처엔저니어링(필요한것만 추가 파생변수 등 만든다. 뉴테이트 변수를 빼거나 넣거나 하는것 )
->분석(예측,분류,CNN등)
회귀 최소제곱법 찾는 방법
  단순회귀 예측 연속성
  로지스트회귀 양자택 sigmoid함수 ->
  다중회귀
전문가시스템
의사결정에서의 시리즈

kaggle 알고리즘 -> 분석 의사결정시리즈

vector점이라고 생각하면 된다.
중간에는 decition boundary
margin
support vetors
선형분류 문제 ->데이터를 분류하는 선형 결정경계(Decstion boudary)
분류가능하도록 데이터를 변화시킨다.
함수로 해서 변화한다. 변형을 시킨다.
비선형 분류
소프트 마진
hypperparameter 로 튜닝하다.

playground.tensorflow.org/#activation=tanh&batchSize=10&dataset=circle&regDataset=reg-plane&learningRate=0.03&regularizationRate=0&noise=0&networkShape=4,2&seed=0.47074&showTestData=false&discretize=false&percTrainData=50&x=true&y=true&xTimesY=false&xSquared=false&ySquared=false&cosX=false&sinX=false&cosY=false&sinY=false&collectStats=false&problem=classification&initZero=false&hideText=false

Tensorflow — Neural Network Playground

Tinker with a real neural network right here in your browser.

playground.tensorflow.org

playground tensorflow

RBF(radial basis function)비선형 분류-> 잘 모르는 RBF
RBF(radial basis function)가우시안 커널
서포트 백터 머신의 장단점
장점
1. 분류문제나 회귀문제 동시에 쓸 수 있다.
2.사용하기 쉽다.
3. 예측의 정확도가 높다
단점
1.커널 hypperparameter 등을 위해 튜닝 과정이 필요하다.
2.선형 회귀/로지스틱 회귀와 다르게 신뢰 구간등을

svm 지원 library
e1071
kabR
kerlab

신경망은 rgb함수로
kernel 선형 아니면 마법사처럼 위에 올라갈것인지
분류가지고 예측한다.
데이터

#분류

install.packages("RCurl")
# load the library
library(RCurl)
# specify the URL for the Iris data CSV
urlfile <-'http://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data'
# download the file
downloaded <- getURL(urlfile, ssl.verifypeer=FALSE)
# treat the text data as a steam so we can read from it
connection <- textConnection(downloaded)
# parse the downloaded data as CSV
letters <- read.csv(connection, header=FALSE)
# preview the first 5 rows
colnames(letters)<-c("letter","xbox", "ybox","width","height",
                     "onpix","xbar","ybar","x2bar","y2bar",
                     "xybar","x2ybar","xy2bar","xedge","xedgey",
                     "yedge","yedgex")
View(letters)
str(letters)
head(letters)
write.csv(letters,"letters.csv")
#머신러닝은 기계로 배운다.

 test_split = 0.2
train_size = round((dim(letters)[1] *(1- test_split)))#20000* -0.8 test는 4만계
set.seed(20180621)#seed 값에 따라사  sample을 돈다.
train_index = sample(1:(dim(letters)[1]),train_size) #1~ 20000, 16000

letters_train <- letters[train_index,]
letters_test  <- letters[-train_index,] #train한것 나머지를 구한다.

install.packages("e1071")
library(e1071)#svm 지원하는 것 
#데이터 , 문자로 맞추고 변수는 그 나머지 모든 것
#fitting svm with linear kernel
letters_linear <- svm(letter~. , data = letters_train,kernel ="linear")
summary(letters_linear)

Call:
svm(formula = letter ~ ., data = letters_train, kernel = "linear")

Parameters:
   SVM-Type:  C-classification
SVM-Kernel:  linear
       cost:  1

Number of Support Vectors:  7147

( 225 456 371 133 484 436 213 209 234 183 313 202 239 287 235 298 346 236 240 335 240 196 175 280 458 123 )

Number of Classes:  26

Levels:
A B C D E F G H I J K L M N O P Q R S T U V W X Y Z

선형이여서 감마 값이 없다. 얼마나 멀리 있는지 가까이 있는지

Ubuntu ->windows에서 linux사용가능하다.

!hostname
#나를 위해서 만들어진 서버
7f22e97478ea

'Study > R' 카테고리의 다른 글

R-6 (0)	2020.09.05
R-5 (0)	2020.09.05
R-3 (0)	2020.09.05
R -2 (0)	2020.09.05
R-1 (0)	2020.09.02

NAIAHD

R-4

'Study > R' 카테고리의 다른 글

+ Recent posts

티스토리툴바