R-6 :: NAIAHD

R-6

2020. 9. 5. 14:14

728x90

install.packages("nycflights13")
library(nycflights13)
data(package = "nycflights13")
4개의 data set이 있다.
Data sets in package ĄŽnycflights13ĄŻ:

airlines                            Airline names.
airports                            Airport metadata
flights                             Flights data
planes                              Plane metadata.
weather                             Hourly weather data

install.packages("nycflights13")
library(nycflights13)
data(package = "nycflights13")

flights
#행의 개수

dim(flights)
str(flights)
nrow(flights)
flights #tibble 이여서 몇개만 보여주고 마지막에 개수 보여준다.
summary(flights)#행의 개수 안보임

fly1 <- flights
summary(fly1)
fly1 <- as.data.frame(fly1)
summary(fly1)#행의 개수  class를 보기

#flights 데이터셋에서 열이 이름(변수명)
colnames(flights)
names(flights)

#1월 1일 데이터는 모두 몇개 입니까 ?
library(dplyr)
flights %>% filter(month == '1' & day == '1' ) %>% nrow

flights

#도착 지연 시 이 2시간 이하인 항공편은 모두 몇 회입니까 ?
flights %>% filter(arr_delay < 120) %>% nrow

#출발시간과 도착시간이 모두 1시간 이상 지연된 항공편은 모두 몇 회입니까?
flights %>% filter(dep_delay >= 60 & arr_delay >= 60) %>% nrow

#filter(dep_delay >= 60 & arr_delay >= 60)->flight,origin,dest,arr_delay,dep_delay,distance,air_time조회해서 데이터셋 만들기 
flights_new <- flights %>% filter(dep_delay >= 60 & arr_delay >= 60) %>% select(flight,origin,dest,arr_delay,dep_delay,distance,air_time ,arr_time)
flights_new

#도착지연은 오름차순 출발지연은 내림차누
flights_new %>% arrange(arr_delay,desc(dep_delay))#앞에것 하고 동급 있을때 내림차순으로 한다.

#gain speed구하기 
flights_new %>% mutate( gain = arr_delay- dep_delay, speed = distance/air_time )

flights %>% nrow#336776
flights <- na.omit(flights)#na없에기 
flights %>% nrow#327346
flights %>% summarize(delay= mean(dep_delay), delay_sd = sd(dep_delay))

#flights에서 12개월가 월별 평균출발시간(dep_time)을 구하세요
flights
flights %>% group_by(month) %>% summarise(dep_mean = mean(dep_time))
#'mean(dep_time)' 이름을 지정하지 않았다.


flights %>% distinct(month)
str(flights)
summary(flights)
summary(airquality)
library(ggplot2)
summary(mpg)

크롤링이란   (Crawling)  ->데이터 전처리
크롤링(Crawling)이란?
- 스크레이핑(Scraping)이라고도 하며 웹 페이지를 그대로 가져와서 거기서 데이터를 추출해내는 행위이다.
크롤링(Crawling)은 불법일까?
- 크롤링하여 얻은 데이터를 개인 하드에 소장하는 것까지는 합법이다.
하지만 배포하면 그 순갂부터 합법과 불법이 갈린다.
- 합법적인 크롤링은 사이트 운영자의 의사에 반하지 않는 것이다.
- 불법적인 크롤링은 사이트 운영자의 의사에 반하거나 실정법을 어기는 것이다.
- 크롤링은 웹페이지의 소스가 포함되어 있다. 이런 소스들은 웹프로그래밍 저작물이며
이를 불법 복제하는 것은 위법이다.
크롤링(Crawling)을 위한 기본 개념을 알아보자.
- 서버 : 외부에서 요청하면 규칙대로 정보를 제공하는 역할을 한다.
- 브라우저 : 서버가 주는 것들을 사용자에게 보여준다.
크롤링(Crawling)을 위한 기본 개념을 알아보자.
- 웹 서버는 text(html, css, js 등)와 image를 브라우저를 통해 사용자에게     보여준다
준비물

1. 구글크롬 2. Selector Gadget 3. 엑셀

• 방법

1. 구글크롬을 설치한다.

2. https://selectorgadget.com/에서 들어가서 북마크에 설치한다.

SelectorGadget: point and click CSS selectors

SelectorGadget:point and click CSS selectors SelectorGadget Screencast from Andrew Cantino on Vimeo. SelectorGadget is an open source tool that makes CSS selector generation and discovery on complicated sites a breeze. Just install the Chrome Extension or

selectorgadget.com

3. 다음에 들어간다.
4. 검색 키워드로 서치한다.
5. 원하는 컨테츠를 선택한다.(ex. 뉴스, 블로그, 카페)
6. 2페이지에 갔다가 다시 1페이지로 돌아온다.

install.packages("rvest")
library(rvest)
install.packages("stringr")
library(stringr)

title = c()#빈상태에서 접여여기
body = c()

#주소가 자동으로 바꾸게 넣어줘야 한다.
url_base ="https://search.daum.net/search?w=news&q=%EA%B5%AD%ED%9A%8C&DA=PGD&spacing=0&p="
url_crawl = paste(url_base);#paste는 문자 열 둘개를 붙이는 함수 
# url_crawl = paste(url_base, i )#url주소와 i를 붙여라 
print(url_crawl,1)
hdoc = read_html(url_crawl)
hdoc

for(i in 1:10){#i가 1부터 진행
  #url_crawl = paste(url_base, i , sep ="");#paste는 문자 열 둘개를 붙이는 함수 
  url_crawl = paste(url_base, i , sep ="");
 # url_crawl = paste(url_base, i )#url주소와 i를 붙여라 
  print(url_crawl)
  #t_css = ".f_link_b"#소제목으로 되여있다.F12로 하면 볼수 있다. .은 class를 표현하는 것이다.
  #b_css = ".desc"#
  #ie에서 F12로 하면 볼수 있다. .은 class를 표현하는 것이다
  #구글 selectorGadget에서 볼수 있다.
  
  hdoc = read_html(url_crawl)#그 경로에 있는 것을 읽어온다.가져오는 것
  
  
  #t_node = html_nodes(hdoc, t_css)#사진  ,text중 다르다.판단하는 함수가 
  #b_node = html_nodes(hdoc, b_css)
  t_node = html_nodes(hdoc, ".f_link_b")
  b_node = html_nodes(hdoc, ".desc")
  
  title_part = html_text(t_node)#글자를 가져오는 함수 달려가서 가져와야 한다.
  body_part = html_text(b_node)
  
  title = c(title,title_part)
  # title = c() vector형태로 되여있다.
  body = c(body,body_part)
}
news = cbind(title,body)
news
write.csv(news,"crawltest.csv")

텍스트마이닝
빈도분석 • 감성분석 • 한국어처리 KoNLP
rjAVA는 한글에서 필요하는것
RjAVA있어야 한

library(KoNLP)
library(wordcloud2)

useSejongDic()

text = readLines("textMining\\ahn.txt")
text

nouns <- extractNoun(text)#명사만 뽑아내는 것
nouns
#F1extract Nouns from Korean sentence uses Hannanum analyzer
#F2함수 내용  HannanumObj한글 페키지 
#"그렇습니다. 미래는 지금 우리 앞에 있습니다. " ->"미래" "우리" "앞"

nouns <- unlist(nouns)#LIST아니게 바꾼다.->1차원형태로 VECTOR형태로
nouns

nouns <- nouns[nchar(nouns)>=2]#nouns안에 있는 글자의 개수가 2보다 큰것 
nouns#nouns 2글자 이상인것만 한글자 지웠다.

#빈도를 새는것 
wordFreq <- table(nouns)
wordFreq <- sort(wordFreq, decreasing = T)#숫자가 많이 나온것을 정렬
wordFreq <- head(wordFreq, 20)
wordFreq

wordFreq <- table(nouns) %>% sort(decreasing = T) %>% head(20)
wordFreq
wordcloud2(wordFreq,fontFamily = '맑은 고딕')

useSejongDic()
nouns <- readLines("textMining\\leesungman.txt",encoding = "UTF-8") %>% extractNoun() %>% unlist()
nouns <- readLines("textMining\\leesungman.txt") %>% extractNoun() %>% unlist()
nouns <- nouns[nchar(nouns)>= 2]
nouns
wordFreq <- table(nouns) %>% sort(decreasing = T) %>% head(20)
wordcloud2(wordFreq,fontFamily = '맑은 고딕')
#jeonduhwan.txt
#kimdaejung.txt
#leesungman.txt 'textMining\leesungman.txt'에서 불완전한 마지막 행이 발견되었습니다
#roh.txt parkjunghee1.txt,leesungman.txt->그냥하면 된다.

#가장 많이 사용된 단어 알아보기 
txt = readLines("news.txt")
head(txt)

library(stringr)
extractNoun("대한민국의 영토는 한반도와 그 부속서로 한다.")
nouns <- extractNoun(txt)

wordcount <- table(unlist(nouns))
df_word <- as.data.frame(wordcount,stringsAsFactors = F)

df_word <- rename(df_word, word= Var1, freq = Freq)
df_word <- filter(df_word,nchar(word)>=2)

top20 <- df_word %>% arrange(desc(freq)) %>% head(20) 
top20

#useNIAdic
txt <- str_replace_all(txt,"//w","") #정규표현식

빈도가 많이 쓰여진다.
감성 분석
웹 사이트와 소셜미디어에 나타난 소비자의 감성을 붂석하여 유용한 정보로 재가공하는 기술을 의미한다.
문장에서 사용된 단어의 긍정과 부정의 빈도에 따라 문장의 긍정, 부정을 평가한다.
사람이 작성한 텍스트 앆에는 그 글의 주요 대상이 되는 주제(Topic)와 주제에 대한 글쓴이의 의견(Opinion)이 있다.
감성 붂석은 주제에 대한 글쓴이의 의견을 파악하는 것으로 Opinion Mining이라고도 한다

list.of.packages <- c("","","","")
new.packages <- list.of.packages()

install.packages("twitteR")
install.packages("ROAuth")
install.packages("plyr")
install.packages("stringr")
library(twitteR)
library(ROAuth)
library(plyr)
library(stringr)

#키
API_key <- ""
API_secret <- ""
access_token <- ""
access_secret <- ""

#object로 할수 있다. save(df_midterm,file="df_midterm.rda")->1일 날 것
#load("df_midterm.rda")

setup_twitter_oauth(counsumer_key = API_key, consumer_secret = API_secret,access_token = access_token,access_secret = access_secret)


#전 세계에서 
install.packages("ggmap")
library(ggmap)
library(ggplot2)

html.airports <- read_html("https://en.wikipedia.org/wiki/List_of_busiest_airports_by_passenger_traffic")
html.airports
df <- html_table(html_nodes(html.airports, "table")[[1]],fill= T) #[[]]list중에서 첫번째것 
#html_table 표 가지로 가는 것 
#html_nodes 노드 가지로 가는 것 

head(df)
head(df$Airport)
colnames(df)
colnames(df)[6] <- "total" #있으면 보여주고 없으면 만들어준다.
df$total
df$total <- gsub(',','',df$total)#정규표현식 할떄 나오는 것 g는 global하는 것 
#전부다 대체 하는 것이다. ,를 찾아서 ''으로 대체 한다.
df$total <- as.numeric(df$total)#character->숫자로 바꾸는 것 
df$total

Github에 있는 twitteR 패키지를 설치하기 위해 devtools 패키지를 설치한다.
rjson, bit64, httr은 의존성으로 함께 설치가 필요하다.
ROAuth 패키지는 트위터의 권한을 얻기 위해 설치가 필요하다.

* 패키지 설치 후 library() 로 로드한다. twitter 패키지는 install_github() 함수를 사 용하며
username은 자싞의 Desktop 사용자 이름으로 정의한다.

* 앞서 설명했던 API_key, API_secret, access_token, access_secret 을 복사 붙여넣기 한다.

* 저장했던 4개의 객체를 옵션으로 넣고 함수를 실행한다.

* searchTwitter() 함수를 이용하여 @apple 를 검색한 자료를 가져온다. since, until로 검색 기갂을 설정하고, lang로 언어를 설정할 수 있다.

* 500개를 설정했지만 실제 자료는 300개가 출력됐다. 해당 키워드에 일치되는 결과가 300개이기 때문

유투브에서 보기
3blue1brown
cs230
cs231n

knitr::kable(anscombe)
anscombe.1 <- data.frame(x = anscombe[["x1"]], y = anscombe[["y1"]], Set = "Anscombe Set 1")
anscombe.2 <- data.frame(x = anscombe[["x2"]], y = anscombe[["y2"]], Set = "Anscombe Set 2")
anscombe.3 <- data.frame(x = anscombe[["x3"]], y = anscombe[["y3"]], Set = "Anscombe Set 3")
anscombe.4 <- data.frame(x = anscombe[["x4"]], y = anscombe[["y4"]], Set = "Anscombe Set 4")
anscombe.data <- rbind(anscombe.1, anscombe.2, anscombe.3, anscombe.4)
aggregate(cbind(x, y) ~ Set, anscombe.data, mean)
aggregate(cbind(x, y) ~ Set, anscombe.data, sd)
model1 <- lm(y ~ x, subset(anscombe.data, Set == "Anscombe Set 1"))

model2 <- lm(y ~ x, subset(anscombe.data, Set == "Anscombe Set 2"))

model3 <- lm(y ~ x, subset(anscombe.data, Set == "Anscombe Set 3"))

model4 <- lm(y ~ x, subset(anscombe.data, Set == "Anscombe Set 4"))
library(plyr)

correlation <- function(data) {
  
  x <- data.frame(r = cor(data$x, data$y))
  
  return(x)
  
}

ddply(.data = anscombe.data, .variables = "Set", .fun = correlation)

summary(model1)

summary(model2)

summary(model3)

summary(model4)

library(ggplot2)
gg <- ggplot(anscombe.data, aes(x = x, y = y))

gg <- gg + geom_point(color = "black")

gg <- gg + facet_wrap(~Set, ncol = 2)

gg <- gg + geom_smooth(formula = y ~ x, method = "lm", se = FALSE, data = anscombe.data)

gg

#package있는지 없는지
install.packages("datasauRus")
library(datasauRus)
if(requireNamespace("dplyr")){
  suppressPackageStartupMessages(library(dplyr))
  datasaurus_dozen %>% 
    group_by(dataset) %>% 
    summarize(
      mean_x    = mean(x),
      mean_y    = mean(y),
      std_dev_x = sd(x),
      std_dev_y = sd(y),
      corr_x_y  = cor(x, y)
    )
}
if(requireNamespace("ggplot2")){
  library(ggplot2)
  ggplot(datasaurus_dozen, aes(x=x, y=y, colour=dataset))+
    geom_point()+
    theme_void()+
    theme(legend.position = "none")+
    facet_wrap(~dataset, ncol=3)
}
#reshap2가 없을때 
if(!require(reshape2)){
  install.packages("reshape2")
  require(reshape2)
}

비지도학습-군집분석
1.군집분석이란?
2.k-means
3.적정 k의 값
k는 숫자이다.
분류 vs 군집화
분류: 범주의 수 및 각 개체의 범주 정보를 사전에 알 수 있으면, 입력 변수 값으로부터 범저 정보를 유추하여 새로운 개체에 대해 가장
적합한 범주로 할당하는 문제(지도 학습)
군집화(clustering) 군집의 수 ,속성 등이 사전에 알려져 있지 않으면 최적의 구분을 찾아가는 문제(비지도 학습)
k는 숫자이다.
#회귀문제

if(!require(caret)){
  install.packages("caret")
  require(caret)
}

data("iris")
View(iris)

set.seed(123)
inTrain <- createDataPartition(y = iris$Species, p = 0.7, list = F)
training <- iris[inTrain,]
testing <- iris[-inTrain,]
training
#표준화
training.data <- scale(training[-5])#데이터 형태 맞춘다.
training.data
summary(training.data)

iris.kmeans <- kmeans(training.data[,-5],center = 3, iter.max = 10000)
iris.kmeans$centers

의사결정 나무
의사결정나무 정형데이터 나무 시리즈
다양한 도형을 분류하는 의사 결정 나무
검정색 찾을 때 까지 얼마나 가야 만 찾을 수 있는지

if(!require(NbClust)){
  install.packages("NbClust")
  require(NbClust)
}
#값이 정해진 것이 아니다.맘데로 정할 수 있다.
nc <- NbClust(training.data, min.nc = 2, max.nc = 15, method = "kmeans")

barplot(table(nc$Best.n[1,]),xlab ="Number of Clusters",ylab ="Number of chosen",main="Number of Clusters chosen")

training.data <- as.data.frame(training.data)#레ㅌ프트 생성한다.
modFit <- train(x = training.data[,-5],y = training$cluster, method="rpart")
#의상결정으로 분류하는 것  rpart
#rpart 반복적으로 20번씩 계속 해주는 것 
#분집을 나누고 의사결정 모데을 만들어서 예측해서 
#정답하고 결과가 얼마나 차이나는 가 
testing.data <- as.data.frame(scale(testing[,-5]))
testingclusterPred <- predict(modFit, testing.data)
table(testingclusterPred,testing$Species)

entropy 순도를 높여가는 양식
다른 변수인 소득을 기준으로 정렬하고 다시 같은 작업을 반복
entropy낮은 기준에서 나눈다.
순도를 높이기 위해 계속 나눈다.

계층적 방법
dendrogram 계층적분석 방법이다. ggplot에서

knn분석

ribbon
#ribbon
huron <- data.frame(year = 1875 : 1972 , level = as.vector(LakeHuron))
huron
ggplot(data = huron, aes(x= year))+geom_area(aes(y = level))
p <- ggplot(data = huron, aes(x = year))
p <- p+geom_area(aes(y = level))
p + coord_cartesian(ylim = c(570,590))#limit줘서 해줘야 한다.


p <- ggplot(data = huron, aes(x= year))
p <- p+geom_area(aes(y = level))
p +coord_cartesian(ylim = c(min(huron$level)-2,max(huron$level)+2))

 
p <- ggplot(huron,aes(x= year))+geom_ribbon(aes(ymin= min(level)-2,ymax = level+2))

p <- ggplot(huron, aes(x= year)) +geom_ribbon(aes(ymin = min(level)-2, ymax = level+2))

p <- ggplot(huron,aes(x = year,fill="skyblue"))+geom_ribbon(aes(ymin = min(level)-2, ymax = level+2,fill="skyblue"),color="skyblue")

p <- ggplot(huron, aes(x=year,fill="skyblue"))
p + geom_ribbon(aes(ymin=level-2, ymax=level+2 ,fill="skyblue"), colour="blue")

knn ->거리를 계싼하는 것
knn(k-Nearest Neighbor)새로운 돌아온 데이터와 그룹의 데이터와 가장 가까우니 새로운것과 재일 가까운 것은 그룹이다.->분류 알고리즘

k 인접 이웃 분류
overfitting 완전 최적화 데이터 under fit이 안되도록 학습할때 잘 안될 수 있다.
일반화 유지 그래서 적절한 것을 구해야 한다.
유클리드 거리 Euclidean distance L2거리 직선거리
맨하탄 거리 직선이 아닐때 L1거리

'Study > R' 카테고리의 다른 글

R-8 (0)	2020.09.05
R-7 (0)	2020.09.05
R-5 (0)	2020.09.05
R-4 (0)	2020.09.05
R-3 (0)	2020.09.05

NAIAHD

R-6

'Study > R' 카테고리의 다른 글

+ Recent posts

티스토리툴바