본문 바로가기

R Programming/Notes

R 프로그래밍(3) - Factor & DataFrame

04 Factor 팩터

1. 문자열 표현 방식

‘The ‘R’ project’                                              

>> Error

“The “R” project”                                            

>> Error

“The ‘R’ project”                                            

>> [1] “The ‘R’ project”

‘The “R” project’                                             

>> [1] “The \”R\” project”

 

a <- ‘The \’R\’ project’

b <- “The \”R\” project”

a; b                                                                        

>> [1] “The ‘R’ project”

     [1] “The \”R\” project”

print(a)                                                               

>> [1] “The ‘R’ project”

cat(a, ‘\n’)                                                      

>> The ‘R’ project

cat(b, ‘\n’)                                                      

>> The “R” project

 

2. 이스케이프 Escape 문자

a <- ‘a\’b\”c\td\ne’

a                                                                             

>> "a'b\"c\td\ne"      *** ‘ 앞의 \ 만 생략됨

print(a)                                                               

>> "a'b\"c\td\ne"

cat(a)                                                                 

>> a’b”c     d

          e

 

3. Factor

x <- rep(c(“male”, ‘female’), 5)

x                                                                             

>> [1] “male” “female” … “male” “female”

y <- factor(x)

y                                                                             

>> [1] male female … male female

    Levels: female male

class(y)                                                               

>> [1] “factor”

str(y)                                                                   

>> Factor w/2 levels “female”,“male”: 2 1 2 1 2 1 2 1 2 1

 

is.vector(x)                                                       

>> [1] TRUE

is.factor(y)                                                        

>> [1] TRUE

is.character(x)                                                 

>> [1] TRUE

is.character(y)                                             

>> [1] FALSE

summary(x)                                                     

>> Length     Class             Mode

      10           character      character

summary(y)                                                     

>> female  male

         5           5

table(x)                                                              

>> x

    female  male

    5             5

table(y)                                                              

>> y

    female  male

    5             5

cat(x, ‘\n’)                                                       

>> male female … male female

cat(y, ‘\n’)                                                       

>> 2 1 2 1 2 1 2 1 2 1

cat(x, sep = ‘\n’)                                           

>> male

    female

     …

cat(as.vector(y), sep = ‘\n’)                    

>> male

 

4. 수치 벡터

x <- rep(1:3, 3)

x                                                                             

>> [1] 1 2 3 1 2 3 1 2 3

summary(x)                                                     

>> Min.       1st Qu.   Median Mean    3rd Qu.   Max.

        1              1             2           2            3            3

y <- factor(x)                    

y                                                                             

>> [1] 1 2 3 1 2 3 1 2 3

       Levels: 1 2 3

str(y)                                                                   

>> Factor w/ 3 levels “1”,“2”,“3”: 1 2 3 1 2 3 1 2 3

summary(y)                                                     

>> 1          2        3

     3          3        3

 

5. 함수 levels

levels(x)                                                                             

>> NULL

levels(y)                                                                             

>> [1] “1” “2” “3”

levels(rep(c(“male”, ‘female’), 5))                         

>> NULL

levels(factor(rep(c(“male”,”female”), 5)))               

>> [1] “female” “male”

 

names(x) <- LETTERS[1:length(x)]

x                                                                             

>> A B C D E F G H I

      1 2 3 1 2 3 1 2 3

str(x)                                                                   

>> Named int [1:9] 1 2 3 1 2 3 1 2 3

      - attr(*, “names”)= chr [1:9] “A” “B” …

levels(x)                                                             

>> NULL

names(x)                                                           

>> [1] “A” “B” “C” “D” “E” “F” “G” “H” “I”

names(y)                                                           

>> NULL

 

6. 팩터 수준 순서 및 값 편집

1)      변수 선언

x <- sample(c(“low”, “mid”, “high”), 10, replace = T, prob = c(0.2, 0.5, 0.3))

x                                                                     

>> [1] “mid” “low” “mid” “mid” “mid” “mid” …

summary(x)                                             

>> Length  Class        Mode

      10       character  character

table(x)                                                      

>> x

   high    low      mid

      1       1           8

y <- factor(x)

y                                                                     

>> [1] mid low mid mid mid mid …

    Levels: high low mid

str(y)                                                            

>> Factor w/ 3 levels "high","low","mid": 3 2 3 3 …

summary(y)                                              

>> high     low       mid

       1         1            8

table(y)                                                       

>> x의 table과 동일한 형태로 출력

 

2)      값 편집

y <- factor(x, levels = c(“low”, “mid”, “high”))

y                                                                     

>> [1] mid low mid mid mid mid high mid mid mid

   Levels: low mid high

str(y)                                                           

>> Factor w/3 levels “low”, “mid”, “high”: 2 1 2 2 2 …

summary(y)                                             

>> low mid high

     1       8       1

x[10] <- “high”                                        

>> 10번째 값 수정

y[10] <- “high”                                        

>> 10번째 값 수정

x[10] <- “높음”                                         

>> 10번째 값 수정

y[10] <- “높음”                                         

>> Warning Message(NA generated)

y                                                                     

>> [1] mid low mid … <NA>

    Levels: low mid high

y <- factor(y, levels = c(levels(y), “very.high”))

y[10] <- “very.high”

y                                                                     

>> [1] mid low mid mid … very.high

    Levels: low mid high very.high

 

y <- factor(y, levels = c(“low”, “mid”, “high”, “very.high”),

labels = c(“Small”, “Medium”, “Large”, “Huge”))

 

y                                                                     

>> [1] Medium Small Medium Medium … Huge

     Levels: Small Medium Large Huge

levels(y)[1:3] <- c(“low”, “mid”, “high”)

y                                                                     

>> [1] mid low mid mid … Huge

    Levels: low mid high Huge

levels(y) <- c(“low”, “not.low”, “not.low”, “not.low”)

y                                                                     

>> [1] not.low low not.low not.low …

    Levels: low not.low

 

7. 함수 as.factor

a <- sample(1:3, 20, replace = T)

str(a)                                                                   

>> int [1:20] 2 2 3 3 1 1 2 1 1 2 …

summary(a)                                                     

>> Min. 1st Qu. …

     …

b <- as.character(a)

str(b)                                                                   

>> chr [1:20] “2” “2” “3” …

summary(b)                                                     

>> Length Class Mode

    …

c <- as.factor(a)

str(c)                                                                   

>> Factor w/ 3 levels “1”,“2”,“3”: 2 2 3 3 1 1 2 …

summary(c)                                                      

>> 1 2 3

     7 6 7

 

05 DataFrame 데이터프레임 기본속성

1. 데이터프레임 생성과 구성요소

1) 벡터를 이용한 생성

a <- c(‘홍길동’, ‘홍길순’, ‘홍길자’)

b <- c(80, 100, 70)

c <- c(60, 50, 70)

DATA <- data.frame(row.names = a, 국어 = b, 영어 = c)

DATA                                                           

>>                  국어        영어

    홍길동          80          60

    홍길순         100         50

    홍길자          70          70

2) 새로운 열 추가

DATA <- data.frame(DATA, 평균 = (b + c) / 2, 합계 = b + c)

DATA                                                           

>>                        국어        영어        평균        합계

   홍길동               80           60           70        140

                                  …

3) 함수 str

str(DATA)                                                  

>> ‘data.frame’: 3 obs of 4 variables:

   $ 국어: num 80 100 70

    …

   $ 합계: num 140 150 140

class(DATA)                                             

>> [1] “data.frame”

4) 구성요소

DATA$국어                                                 

>> [1] 80 100 70

str(DATA$국어)                                       

>>  num [1:3] 80 100 70

rownames(DATA)                                  

>> [1] “홍길동” “홍길순” “홍길자”

colnames(DATA)                                   

>> [1] “국어” “영어” “평균” “합계”

str(rownames(DATA))                        

>> chr [1:3] “홍길동” “홍길순” “홍길자”

 

2. 인덱싱

DATA$국어[1]                                                  

>> [1] 80

DATA$국어[2:3]                                              

>> [1] 100 70

DATA$국어[c(1, 3)]                                       

>> [1] 80 70

DATA[1, 3]                                                        

>> [1] 홍길동의 평균점수

DATA[1:2, 4]                                                    

>> [1] 홍길동과 홍길순의 합계

DATA[c(1, 3), c(3, 4)]

>> 홍길동, 홍길자의 평균, 합계를 표로 출력

DATA[c(1, 3), ]                                               

>> 홍길동, 홍길자의 국어, 영어, 평균, 합계 출력

DATA[, 3:4]                                                      

>> 홍길동, 홍길순, 홍길자의 평균, 합계 출력

3. 값의 편집

DATA$영어[2] <- 70                                      

>> 홍길순의 영어 점수가 70으로 편집됨

DATA$영어[2:3] <- c(70, 80)

DATA[2:3, 1] <- c(70, 80)                          

>> 홍길순, 홍길자의 국어 점수가 70, 80으로 편집됨

rownames(DATA)[1] <- ‘김갑동’              

>> 홍길동의 이름이 김갑동으로 편집됨

rownames(DATA)[2:3] <- c(‘김철수’, ‘김영희’)

 

4. 조건 검색

DATA$영어 [DATA$영어 > 60]                    

>> [1] 70 80

DATA[DATA$영어 > 60, 1:3]

>>                국어        영어        평균

    김철수       70          70           75

    김영희       80          80           70

DATA[DATA$영어 > 60, ]

>>                국어        영어        평균        합계

    김철수      70           70           75         150

    김영희      80           80           70         140

 

5. 정렬

df                                                                           

>>                 국어        영어        평균

      김갑동      80           61         70.5

      김철수      95           50         72.5

      김영희      80           73         76.5

df$국어                                                                

>> [1] 80 95 80

sort(df$국어)                                                    

>> [1] 80 80 95

sort(-df$국어)                                                 

>> [1] -95 -80 -80

order(df$국어)                                                 

>> [1] 1 3 2

order(-df$국어)                                              

>> [1] 2 1 3

order(df$국어, decreasing = T)                 

>> [1] 2 1 3

df$국어[order(df$국어)]                         

>> [1] 80 80 95

df$국어[sort(df$국어)]                            

>> [1] NA NA NA

 

df[order(df$국어), ]                                       

>> 김갑동, 김영희, 김철수 순으로 바뀌어 dataframe 출력

df[order(-df$국어), ]                                    

>> 김철수, 김갑동, 김영희 순으로 바뀌어 dataframe 출력
df[order(df$국어, -df$영어), ]                   

>> 김영희, 김갑동, 김철수 순으로 출력

 

df[order(rownames(df)), ]                        

>> 김갑동, 김영희, 김철수 순으로 출력

df[order(-rownames(df)), ]                     

>> Error

df[order(rank(rownames(df))), ]           

>> 김갑동, 김영희, 김철수 순으로 출력

df[order(-rank(rownames(df))), ]        

>> 김철수, 김영희, 김갑동 순으로 출력

 

 

6. 컬럼 삽입, 삭제 / 행수, 열수

1) 컬럼 삽입

df[‘합계’] = df[‘국어’] + df[‘영어’]

str(df)                                                         

>> ‘data.frame’: 3 obs. of 4 variables:

     $국어: num 80 95 80

       …

     $ 합계: num 141 145 153

2) 컬럼 삭제

df[‘평균’] = NULL

df[c(‘국어’, ‘영어’)] = NULL                  

>> 합계만 남음

 

3) 데이터프레임 행수 열수

nrow(df)                                                    

>> [1] 3

ncol(df)                                                      

>> [1] 4

length(df)                                                  

>> [1] 4

dim(df)                                                       

>> [1] 3 4

 

7. 데이터프레임과 문자열, 팩터

x <- sample(c(“low”, “mid”, “high”), 10, replace = T)

y <- factor(x)

data <- data.frame(num = 1:10, pos1 = x, pos2 = y)

head(data, 3)                                                   

>>                 num       pos1      pos2

    1                1             low         low

    2                2             mid        mid

    3                3             mid        mid

str(data)                                                            

>> “data.frame”: 10 obs. of 3 variables

    $num: int 1 2 3 4 …

    $pos1: chr “low” “mid” “mid” …

    $pos2: Factor w/3 levels “high”, “low”, “mid”: 2 3 …

data[order(data$pos1), ]                           

>>                 num       pos1      pos2

      5             5             high       high

    10            10            high       high

      1             1             low         low

      …

data[order(-data$pos1), ]                        

>> Error

data[order(-data$pos2), ]                        

>> 1 2 3 4 5 … 순으로 출력,

Warning Message(- is not meaningful for factors)

                data[order(-rank(data$pos2)), ]           

>> mid, low, high 순으로 차례대로 출력
data[order(data$pos2, decreasing = T), ]               

>> mid, low, high 순으로 차례대로 출력

06 DataFrame 데이터프레임 파일 다루기

1. 함수 scan

1) 수치 데이터

scan(file = ‘…’, what = numeric())                          

>> Read 10 items

   [1] 1.0 2.0 3.0 … 10.0

scan(file = ‘…’, what = numeric(), sep = ‘ ‘)                 

>> Read 10 items

   [1] 1.0 2.0 3.0 … 10.0

scan(file = ‘…’, what = numeric(), sep = ‘\n’)           

>> Error

   *** got 6.57.28910

2) 문자열 데이터

a b c d e

“f g h i j”

 

scan(file  = ‘…’, what = character())                       

>> Read 6 items

   [1] “a” “b” “c” “d” “e” “f g h i j”

scan(file = ‘…’, what = character(), quote = NULL)      

>> Read 10 items

   [1] “a” “b” “c” “d” “e” “\”f” “g” …

scan(file = ‘…’, what = “char”, sep = ‘\n’)                           

>> Read 2 items

   [1] “a b c d e” “\”f g h i j\””

scan(file = ‘…’, what = “”, sep = ‘\n’, quote = NULL)

>> [1] 위와 같은 결과

                                                                               

2. 함수  cat, write

data <- scan(file = ‘…’, what = “char”, quote = NULL)

cat(data, file = “out.txt”, sep = ‘\n’)                                     

>> 작업폴더에 출력됨

write(data, file = “out.txt”, sep = ‘\n’)                                 

>> 위와 같은 결과

 

3. 함수 readLines, writeLines

readLines(‘파일이름’)                                                                    

>> [1] “a b c d e” “\”f g h i j\””

writeLines(readLines(‘파일이름’), ‘out.txt’)                               

>> 작업폴더에 위와 같은 결과가 줄로 출력됨

 

* Scan vs ReadLines

readLines는 빈 줄(NULL line)까지 “”로 출력함

Scan은 NULL line을 제외하고 출력함

 

4. 벡터의 빈도표 만들기

Freq.TEXT <- table(TEXT)

head(Freq.TEXT)                                                                           

>> TEXT

     “environment”  (easily)    (formerly

             1                  1               1

       …

class(Freq.TEXT)                                                                           

>> [1] “table”

str(Freq.TEXT)                                                                                

>> ‘table’ int [1:281(1d)] 1 1 1 1 …

    -attr(*, “dimnames”) = List of 1

    ..$ TEXT: chr [1:281] “\”environment\”” …

Freq.TEXT <- sort(table(TEXT), decreasing = T)

>> 내림차순 정렬

head(names(Freq.TEXT))                                                         

>> [1] “and” “a” “of” “is” …

head(unname(Freq.TEXT))                                                      

>> [1] 27 18 18 14 14 14

head(as.vector(Freq.TEXT))                                                    

>> 위와 같은 결과

 

*** 연습문제: Freq.TEXT를 알파벳 내림차순으로 정렬하여 Freq2 만들기

Freq2 <- Freq.TEXT[sort(names(Freq.TEXT), decreasing = T)]

 

· 빈도표를 데이터프레임으로 변환

Freq.Data <- data.frame(Freq.TEXT)

head(Freq.Data)                                                            

>>            TEXT     Freq

     1          and        27

     2          a            18

     3          of           18

     …

Freq.Data <- data.frame(row.names = names(Freq.TEXT), Freq = as.vector(Freq.TEXT))

head(Freq.Data)                                                            

>>            Freq

   and        27

     a          18

    …

· 데이터프레임에 상대빈도 열 추가

Freq.Data[‘Rel.Freq’] = round(Freq.Data$Freq/sum(Freq.Data$Freq), 3)

head(Freq.Data)                                                            

>>            Freq       Rel.Freq

    and        27           0.056

     a           18           0.037

    …

 

5. 함수 write.table

write.table(Freq.Data, file = “Freq.txt”, quote = F, sep = ‘\t’, col.names = NA)               

>> 작업폴더에 출력

 

6. 함수 read.delim

df <- read.delim(file = ‘Freq.txt’, sep = ‘\t’, header = T, row.names = 1, quote = NULL)

head(df)                                                                             

>>            Freq       Rel.Freq

    and        27           0.056

     a           18           0.037

    of           18           0.037

    …

str(df)                                                                                 

>> ‘data.frame’: 281 obs. of 2 variables:

    $Freq: int 27 18 18 14 …

    $Rel.Freq: num 0.056 0.037 …