R Conqueror: R programming

레이블이 R programming인 게시물을 표시합니다. 모든 게시물 표시

레이블이 R programming인 게시물을 표시합니다. 모든 게시물 표시

2014년 12월 1일 월요일

비정형 데이터로 다양한 그래프 그리기

        ## Jeju travel-course search text: extract nouns, count them, and draw
        ## a pie chart of the ten most frequently mentioned words.

        # NOTE: machine-specific working directory; the relative paths below
        # are resolved against it.
        setwd("C:\\Users\\user\\Desktop\\R까기")
        getwd()

        library(KoNLP)
        library(wordcloud)
        library(RColorBrewer)
        useSejongDic()

        # Add words to the user dictionary (tag "ncn").
        mergeUserDic(data.frame("신비의도로", "ncn"))
        mergeUserDic(data.frame("주상절리", "ncn"))

        # 1. Read the raw text, one element per line.
        txt <- readLines("data/Part_1/LEVEL_1/jeju.txt")
        head(txt)
        typeof(txt)

        # 2. Extract nouns from every line.
        txt_nouns <- sapply(txt, extractNoun, USE.NAMES = FALSE)
        typeof(txt_nouns)
        head(txt_nouns)

        # 3. Flatten to a single vector so it can be filtered.
        txt_nouns_unlist <- unlist(txt_nouns)

        # 4. Keep only words of two or more characters.
        place <- Filter(function(x) nchar(x) >= 2, txt_nouns_unlist)
        typeof(place)
        head(place, 60)

        # 5. Strip unwanted words. The patterns are applied in order, each one
        #    to the result of the previous one. A pattern containing literal
        #    digits has to run before "\\d+", because after digits are removed
        #    it can never match. The redundant "40" pattern and the repeated
        #    "바다" entries of the earlier version are dropped.
        drop_patterns <- c(
          "6월4일부터", "무난", "전국", "렌트카", "\\d+",
          "입장료", "관광지", "대략적", "어디", "여행", "숙소", "도움",
          "연휴", "할인", "없구요", "하시", "되버려서", "가격", "질문",
          "모바일할인쿠폰을", "모바일쿠폰을", "일정", "예약", "제주",
          "공항", "해안", "이용", "경우", "전망", "코스", "시간", "추천",
          "일출", "드라이브", "도착", "사진", "가능", "박물관", "바다",
          "경유", "소요", "하루", "하게", "녹차", "위치", "출발", "다양",
          "랜드"
        )
        for (pattern in drop_patterns) {
          place <- gsub(pattern, "", place)
        }

        # 6. Save the words, one per line. Words emptied by step 5 become
        #    blank lines, which read.table() skips by default.
        write(unlist(place), "jeju_2.txt")

        # 7. Read the words back as a data frame.
        rev <- read.table("jeju_2.txt")
        typeof(rev)
        nrow(rev)

        # 8. Build the frequency table and show the 30 most frequent words.
        wordcount <- table(rev)
        head(sort(wordcount, decreasing = TRUE), 30)

        # 9. Pie chart of the ten most frequent words.
        a <- head(sort(wordcount, decreasing = TRUE), 10)
        windows()
        pie(a)
        savePlot("jeju_2.jpg", type = "jpg")

결과

        # 10. Redraw the pie chart with one rainbow colour per slice.
        color <- rainbow(10)
        pie(x = a, radius = 1, col = color)
        savePlot(filename = "jeju_3.jpg", type = "jpg")

결과

    

        # 11. Label each slice with its percentage share, then overlay a white
        #     inner pie to get a donut shape.
        pct <- round(a / sum(a) * 100, 1)
        names(a)
        typeof(a)

        lab <- paste(names(a), "\n", pct, "%")
        lab

        pie(
          a,
          labels = lab,
          radius = 1,
          col = color,
          main = "제주도 추천 코스"
        )

        # Draw the next plot on top of the current one instead of starting a
        # fresh page.
        par(new = TRUE)
        pie(a, labels = NA, radius = 0.6, col = "white", border = NA)
        savePlot(filename = "jeju_4.jpg", type = "jpg")

결과

    

        # 12. Bar chart of the ten most frequent words.
        bar <- head(sort(wordcount, decreasing = TRUE), 10)

        # space: gap between bars; ylim: y-axis range; cex.names: size of the
        # bar labels; las = 1: axis labels always drawn horizontally.
        barplot(
          bar,
          main = "제주도 추천 코스 TOP 10",
          col = color,
          space = 0.8,
          ylim = c(0, 25),
          cex.names = 0.7,
          las = 1
        )

        savePlot(filename = "jeju_5.jpg", type = "jpg")

결과

    
        
        
        
        ## 13. Annotate every bar with its count and its percentage share.
        pct_bar <- round(bar / sum(bar) * 100, 1)
        pct_bar

        # barplot() returns the bar midpoints; they are the x positions for
        # text(). The chart is drawn once (an earlier version drew the same
        # chart twice before annotating it).
        bp <- barplot(
          bar,
          main = "제주도 추천 코스 TOP 10",
          col = color,
          space = 0.8,
          ylim = c(0, 25),
          cex.names = 0.7,
          las = 1
        )

        # Percentage just above the top of each bar, count just below it.
        text(
          x = bp, y = bar * 1.05,
          labels = paste("(", pct_bar, "%", ")"),
          col = "black", cex = 0.7
        )
        text(
          x = bp, y = bar * 0.95,
          labels = paste(bar, "건"),
          col = "black", cex = 0.7
        )

        savePlot("jeju_6.jpg", type = "jpg")

결과

    

        # Horizontal variant of the annotated bar chart; barplot() returns the
        # bar midpoints, which are used as y positions for the labels.
        bp_h <- barplot(
          bar,
          main = "제주도 추천 코스 TOP 10",
          col = color,
          space = 0.8,
          xlim = c(0, 25),
          cex.names = 0.7,
          las = 1,
          horiz = TRUE
        )

        # Percentage to the right of each bar end, count just inside it.
        text(
          x = bar * 1.15, y = bp_h,
          labels = paste("(", pct_bar, "%", ")"),
          col = "black", cex = 0.7
        )
        text(
          x = bar * 0.9, y = bp_h,
          labels = paste(bar, "건"),
          col = "black", cex = 0.7
        )

        savePlot(filename = "jeju_7.jpg", type = "jpg")

결과

2014년 11월 24일 월요일

R round() 반올림 함수

round(x, digits = 0)

x : a numeric vector(실수값)
digits : integer indicating the number of decimal places(round), 반올림된 자리수

* digits = 1 : 소수점 아래 자리
* digits = -1 : 소수점 윗 자리
* digits = 0 : 소수점(원점) 자리

> round(123.456, digits = 1) # 소숫점 아래 첫째자리
[1] 123.5

> round(123.456, digits = 0) # 소숫점(원점) 자리
[1] 123

> round(123.456, digits= -1) # 소숫점 윗 첫째자리
[1] 120

ex) 100 - 900(단위:100)사이 랜덤 숫자 5개 출력

> round(runif(5, min=100,max=900), digits=-2) # digits = -2 : 백의 자리로 반올림
[1] 700 500 700 400 300

2014년 11월 20일 목요일

R 그래프 웹에서 그리기(웹시각화 라이브러리:Protovis)

자바스크립트와 svg를 이용하는, 오픈소스 기반의 웹시각화 라이브러리인 Protovis를 이용

http://mbostock.github.io/protovis/

2014년 11월 19일 수요일

에러 로그 word cloud III

 
## Alert log analysis: count repeated log lines and draw them as a word cloud.
## Spaces are replaced with "_" (underscore) -- presumably so that
## read.table() later keeps each log line as a single field.

# NOTE: this clears every object in the global environment.
rm(list = ls(all.names = TRUE))

## 0. Load libraries
library(KoNLP)
library(wordcloud)
library(RColorBrewer)
getwd()

## 1. Read data
txt <- readLines("data/Part_1/LEVEL_1/alert_log.txt")
head(txt, 30)

## 2. Replace spaces with underscores
txt <- gsub(" ", "_", txt)
head(txt, 30)

## 3. Keep only lines of 15 characters or more
txt_nouns <- unlist(txt)
str(txt_nouns)
# The predicate must call nchar(x); comparing the function object `nchar`
# itself with a number is an error.
txt_nouns <- Filter(function(x) nchar(x) >= 15, txt_nouns)
head(txt_nouns, 30)

## 4. Save, one entry per line
write(unlist(txt_nouns), "alert_2.txt")

## 5. Read back as a table and count occurrences
rev <- read.table("alert_2.txt")
nrow(rev)
wordcount <- table(rev)
head(sort(wordcount, decreasing = TRUE), 10)

## 6. Word cloud of entries that occur at least 3 times
windows()
palete <- brewer.pal(9, "Set1")
# `freq` must be spelled out in full, and `scale` is the pair
# c(largest, smallest) text size.
wordcloud(
  names(wordcount),
  freq = wordcount,
  scale = c(5, 0.5),
  rot.per = 0.25,
  min.freq = 3,
  random.order = FALSE,
  random.color = TRUE,
  colors = palete
)

## 7. Save the plot
savePlot("alert_2.jpg", type = "jpg")

결과)

연설문에서 word cloud 생성하기 II

 
## 연설문을 분석해서 word cloud 생성하기

## noh.txt에 저장된 연설문을 분석하여 언급된 단어를 기준으로 워드 클라우드 생성

## 0. load library & setwd()

## 1. read data        

## 2. extract noun

## 3. edit data

## 4. noun nchar >=2

## 5. save

## 6. read data as table

## 7. wordcloud

##

##







 
## 0. Load libraries and set the working directory
library(KoNLP)
library(wordcloud)
library(RColorBrewer)
getwd()
# NOTE: machine-specific path; the relative paths below are resolved
# against it.
setwd("C:\\Users\\user\\Dropbox\\ADsP\\R Conqueror")

## 1. Read data
txt <- readLines("data/Part_1/LEVEL_1/noh.txt")
head(txt)
class(txt)
str(txt)

## 2. Extract nouns from every line
txt_nouns <- sapply(txt, extractNoun, USE.NAMES = FALSE)
head(txt_nouns, 30)

## 3. Edit data: flatten first, then strip digits
# gsub() coerces its input with as.character(); on a list that turns each
# multi-word element into deparsed text such as c("a", "b"). Flattening
# first keeps every noun as its own string.
txt_nouns <- unlist(txt_nouns)
txt_nouns <- gsub("\\d+", "", txt_nouns)

## 4. Keep nouns of two or more characters
txt_nouns <- Filter(function(x) nchar(x) >= 2, txt_nouns)

## 5. Save, one noun per line
write(txt_nouns, "noh_2.txt")

## 6. Read back as a table and count occurrences
rev <- read.table("noh_2.txt")
nrow(rev)
wordcount <- table(rev)
head(sort(wordcount, decreasing = TRUE), 30)

## 7. Word cloud
windows()
palete <- brewer.pal(9, "Set1")
wordcloud(
  names(wordcount),
  freq = wordcount,
  scale = c(5, 0.5),
  rot.per = 0.25,
  min.freq = 1,
  random.order = FALSE,
  random.color = TRUE,
  colors = palete
)

결과)

연설문 분석해서 wordcloud 생성하기 I

 

## hong.txt에 저장된 연설문을 분석하여 언급된 단어를 기준으로 워드 클라우드 생성

## 1. 분석용 데이터를 읽어옴

## 2. 불필요한 단어 삭제

## 3. 필요한 단어 추가

## 4. 파일로 저장

## 5. table 형식으로 변환해서 변수 읽어옴

## 6. wordcloud 출력

## 7. save image

##

##

##



getwd()  # show the current working directory
setwd("C:\\Users\\user\\Desktop\\R까기")  # change the working directory

# 0. Load packages
library(KoNLP)
library(wordcloud)
library(RColorBrewer)

# 1. Read data: every line of the speech text
txt <- readLines("data/Part_1/LEVEL_1/hong.txt")
head(txt)

# 2. Edit data: remove every "7" character
txt <- gsub("7", "", txt)

# 3. Extract nouns and flatten them into one vector
nouns <- sapply(txt, extractNoun, USE.NAMES = FALSE)
nouns_unlist <- unlist(nouns)
head(nouns_unlist, 30)

# 3-1. Keep only nouns of two or more characters
txt_2 <- Filter(function(x) nchar(x) >= 2, nouns_unlist)
txt_2
head(txt_2, 30)

# 4. Save data, one noun per line
write(txt_2, "hong_2.txt")

# 5. Read the data back as a table
rev <- read.table("hong_2.txt")
rev
nrow(rev)  # number of rows

# 6. Count occurrences and show the 30 most frequent words
wordcount <- table(rev)
wordcount
head(sort(wordcount, decreasing = TRUE), 30)

# 7. Draw the word cloud
# Open a window device first; per the original note, savePlot() fails
# without one.
windows()
palete <- brewer.pal(9, "Set1")
wordcloud(
  names(wordcount),
  freq = wordcount,
  scale = c(5, 0.5),
  rot.per = 0.25,
  min.freq = 1,
  random.order = FALSE,
  random.color = TRUE,
  colors = palete
)

결과)

R Filter() ; 두 글자 이상되는것만 필터링 하기

Filter(f, x) # f:function, x: data

두글자 이상 되는 것만 필터링 하기 예

> txt = c("홍길동", "김구", "잉", "a", "aa", "b", "bb", "android")
> txt 
[1] "홍길동" "김구" "잉" "a" "aa" "b" "bb" "android"

> txt  = Filter(function(x){nchar(x) >=2} , txt) # 두글자 이상되는것만 필터링
> txt
[1] "홍길동" "김구" "aa" "bb" "android"

## 필터링을 위해 unlist 작업이 선행되어야함 

## list 형태이면 필터링 안됨


> txt =unlist(txt)

R gsub() : 원하지 않는 내용 걸러내기

R gsub()

Usage :
gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)

gsub("변경전 글자","변경후 글자",원본데이터)

> gsub("\\d+", "", txt) #숫자제거
> gsub("\\.", "", txt) #점(.) 제거

피드 구독하기: 글 ( Atom )