Frequency Plot

Gutenberg

Hilary February 4, 2018

library(gutenbergr)
## Warning: package 'gutenbergr' was built under R version 3.4.3
library(dplyr)
## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.4.3
library(stringr)
# Save the catalogue returned by gutenberg_works() as a CSV file (opens in Excel)
write.csv(gutenberg_works(), "works.csv")

# Look up Treasure Island by exact title in the metadata table; ID 120 is used
filter(gutenberg_metadata, title == "Treasure Island")  # 120
## # A tibble: 3 x 8
##   gutenberg_id           title                  author gutenberg_author_id
##          <int>           <chr>                   <chr>               <int>
## 1          120 Treasure Island Stevenson, Robert Louis                  35
## 2        23936 Treasure Island Stevenson, Robert Louis                  35
## 3        27780 Treasure Island Stevenson, Robert Louis                  35
## # ... with 4 more variables: language <chr>, gutenberg_bookshelf <chr>,
## #   rights <chr>, has_text <lgl>
# Look up Wuthering Heights by exact title in the metadata table; ID 768 is used
filter(gutenberg_metadata, title == "Wuthering Heights")  # 768
## # A tibble: 1 x 8
##   gutenberg_id             title        author gutenberg_author_id
##          <int>             <chr>         <chr>               <int>
## 1          768 Wuthering Heights Brontë, Emily                 405
## # ... with 4 more variables: language <chr>, gutenberg_bookshelf <chr>,
## #   rights <chr>, has_text <lgl>
# Want The Adventures of Sherlock Holmes: substring match on the title
# (the catalogue entry starts with "The"); ID 1661 is the one downloaded
gutenberg_works(str_detect(title, "Adventures of Sherlock Holmes")) #1661
## # A tibble: 2 x 8
##   gutenberg_id                                        title
##          <int>                                        <chr>
## 1         1661            The Adventures of Sherlock Holmes
## 2        48320 "Adventures of Sherlock Holmes\nIllustrated"
## # ... with 6 more variables: author <chr>, gutenberg_author_id <int>,
## #   language <chr>, gutenberg_bookshelf <chr>, rights <chr>,
## #   has_text <lgl>
# Want Taxidermy without a Teacher: substring match on the title;
# the single hit is ID 51439
gutenberg_works(str_detect(title, "Taxidermy without")) #51439
## # A tibble: 1 x 8
##   gutenberg_id
##          <int>
## 1        51439
## # ... with 7 more variables: title <chr>, author <chr>,
## #   gutenberg_author_id <int>, language <chr>, gutenberg_bookshelf <chr>,
## #   rights <chr>, has_text <lgl>
# Look up Great Expectations by exact title in the metadata table; ID 1400 is used
filter(gutenberg_metadata, title == "Great Expectations")  # 1400
## # A tibble: 2 x 8
##   gutenberg_id              title           author gutenberg_author_id
##          <int>              <chr>            <chr>               <int>
## 1         1400 Great Expectations Dickens, Charles                  37
## 2         8608 Great Expectations Dickens, Charles                  37
## # ... with 4 more variables: language <chr>, gutenberg_bookshelf <chr>,
## #   rights <chr>, has_text <lgl>
# Want Adventures of Huckleberry Finn: the substring match returns ten entries;
# ID 76 (the first row) is the one downloaded
gutenberg_works(str_detect(title, "Huckleberry Finn")) #76
## # A tibble: 10 x 8
##    gutenberg_id                                                     title
##           <int>                                                     <chr>
##  1           76                            Adventures of Huckleberry Finn
##  2         7100         Adventures of Huckleberry Finn, Chapters 01 to 05
##  3         7101         Adventures of Huckleberry Finn, Chapters 06 to 10
##  4         7102         Adventures of Huckleberry Finn, Chapters 11 to 15
##  5         7103         Adventures of Huckleberry Finn, Chapters 16 to 20
##  6         7104         Adventures of Huckleberry Finn, Chapters 21 to 25
##  7         7105         Adventures of Huckleberry Finn, Chapters 26 to 30
##  8         7106         Adventures of Huckleberry Finn, Chapters 31 to 35
##  9         7107   Adventures of Huckleberry Finn, Chapters 36 to the Last
## 10        32325 The Adventures of Huckleberry Finn (Tom Sawyer's Comrade)
## # ... with 6 more variables: author <chr>, gutenberg_author_id <int>,
## #   language <chr>, gutenberg_bookshelf <chr>, rights <chr>,
## #   has_text <lgl>
# Download each book by the Gutenberg ID found in the lookups above
ti <- gutenberg_download(120) # Treasure Island
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest

## Using mirror http://aleph.gutenberg.org
wh <- gutenberg_download(768) # Wuthering Heights
sh <- gutenberg_download(1661) # The Adventures of Sherlock Holmes
tx <- gutenberg_download(51439) # Taxidermy without a Teacher
gh <- gutenberg_download(1400) # Great Expectations
hf <- gutenberg_download(76) # Adventures of Huckleberry Finn

# Word counts for Treasure Island, worked through step by step

# Tokenize the `text` column into a `word` column, then drop rows that match
# an entry in `stop_words`
tidy_ti <- anti_join(unnest_tokens(ti, word, text), stop_words)
## Joining, by = "word"
# Tally each remaining word, most frequent first
ti.count <- count(tidy_ti, word, sort = TRUE)

# Return the most frequent non-stop-words of a text.
#
# Arguments:
#   s      Data frame with a `text` column, such as the objects this script
#          obtains from gutenberg_download().
#   n_top  Maximum number of rows to return. Defaults to 10, the value that
#          used to be hard-coded.
#
# Value: the result of count(word, sort = TRUE) (columns `word` and `n`,
# most frequent first), truncated to at most `n_top` rows.
word.count <- function(s, n_top = 10) {
  stopifnot(is.numeric(n_top), length(n_top) == 1, n_top >= 0)

  # Tokenize `text` into a `word` column, then drop rows matching `stop_words`.
  # The join column is left for anti_join() to infer, so it still reports the
  # key it picked in a message.
  tib <- s %>% unnest_tokens(word, text) %>% anti_join(stop_words)
  counts <- tib %>% count(word, sort = TRUE)

  # head() stops at the last row; the previous `[1:10, ]` indexed past the end
  # whenever a text had fewer than ten distinct words.
  head(counts, n_top)
}

# Most frequent words for each of the six downloaded books
w1 <- word.count(ti) # Treasure Island
## Joining, by = "word"
w2 <- word.count(wh) # Wuthering Heights
## Joining, by = "word"
w3 <- word.count(sh) # The Adventures of Sherlock Holmes
## Joining, by = "word"
w4 <- word.count(tx) # Taxidermy without a Teacher
## Joining, by = "word"
w5 <- word.count(gh) # Great Expectations
## Joining, by = "word"
w6 <- word.count(hf) # Adventures of Huckleberry Finn
## Joining, by = "word"
library(ggplot2)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.4.3

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine
# Horizontal bar chart of word frequencies.
#
# Arguments:
#   w      Data frame with columns `word` and `n`.
#   t      Plot title.
#   tzeva  Fill colour for the bars.
#
# Value: a ggplot object. Words are placed on the axis in order of `n`
# (via reorder()) and the axes are flipped so the bars run horizontally.
countplot2 <- function(w, t, tzeva) {
  bar_aes <- aes(x = reorder(word, n), y = n)
  bar_labels <- labs(x = "10 Most Frequent Words", y = "Count", title = t)
  chart <- ggplot(data = w, mapping = bar_aes)
  chart + geom_col(fill = tzeva) + bar_labels + coord_flip()
}

# One frequency chart per book: counts, title, bar colour
p1 <- countplot2(w = w1, t = "Treasure Island", tzeva = "royalblue2")
p2 <- countplot2(w = w2, t = "Wuthering Heights", tzeva = "slategray")
p3 <- countplot2(w = w3, t = "The Adventures of Sherlock Holmes", tzeva = "seagreen")
p4 <- countplot2(w = w4, t = "Taxidermy without a Teacher", tzeva = "chocolate3")
p5 <- countplot2(w = w5, t = "Great Expectations", tzeva = "magenta4")
p6 <- countplot2(w = w6, t = "The Adventures of Huckleberry Finn", tzeva = "firebrick1")

# Arrange the six charts in a two-column grid.
# NOTE(review): p5 is passed before p4 -- confirm this ordering is intended.
grid.arrange(p1, p2, p3, p5, p4, p6, ncol = 2)