3.1 Data Wrangling - 数据类型,新建,摘要

https://dgoretzko.github.io/slv/practicals/01_Data_Wrangling/01_Data_Wrangling_Answers.html

Data types

#install #library #class

install.packages() #安装包
library() #加载包
install.packages("tidyverse")
library(tidyverse)

class() #查看数据类型

class(cats$Sex)
levels(cats$Sex)
object_7 <- as.numeric(object_7) #转换数据类型

Lists and data frames

#object #dataframe #function #bind #binddataframe

#新建object
objects <- list(object_1, object_2, object_3, object_4, object_5, object_6, 
                object_7) 

#新建dataframe
dat <- data.frame(Var1 = object_1, Var2 = object_2, Var3 = object_5)
dat
#设定dataframe的size
ncol(dat)
nrow(dat)

#更详细版本的新建dataframe
Create a data frame called balance_df with 3 columns and 500 rows: student always 0, balance ranging from 0 to 3000, and income always the mean income in the default_train dataset.
#方法1
balance_df <- tibble(
  student = rep(0, 500),
  balance = seq(0, 3000, length.out = 500),
  income  = rep(mean(default_train$income), 500)
)
#方法2
n_rows <- 500
balance_values <- seq(0, 3000, length.out = n_rows)
mean_income <- mean(default_train$income)
balance_df <- data.frame(
  student = rep(0, n_rows),
  balance = balance_values,
  income = rep(mean_income, n_rows)
)
str(balance_df)

#把两个dataframe合在一起,添加一个variable指明数据是来源于哪个dataframe
elastic <- bind_rows("Elastic1" = elastic1,
                     "Elastic2" = elastic2,
                     .id = "Set")


#新建函数(见第四周)
mad <- function(x) {
  median(abs(x - median(x)))
}
students_dataset %>% summarise(mad = mad(grade))

Loading, viewing, and summarising data

#read #tidyr #gather #spread #seperate #unite

#读取在线csv文件,保存成variable,查看数据集的前几行
con <- url("https://www.gerkovink.com/slv/practicals/01_Data_Wrangling/data/googleplaystore.csv")
apps <- read_csv(con)
head(apps)

#读取本地xlsx文件
con <- read_xlsx("data/con.xlsx")
head(con)
tail(con)
View(con) #可以让dataset像在excel里显示一样
summary(con)

#表格行和列安排混乱,重新安排表格
library(tidyr)
gather(cases, "year", n, 2:4)
spread()
seperate() #把年月日一栏分成年、月、日三栏
unite() #三栏合成一栏

Data transformation with dplyr verbs

#dplyr #filter #arrange #select #mutate #recode #summarize #group_by

dplyr cheatsheet1.4 Cheatsheets

#需要先加载tidyverse package dplyr
library(tidyr)

#filter-过滤数据
filter(students, grade < 5.5)
filter(students, grade > 8, programme == "A")
filter(cats, Bwt > 2, Bwt < 2.2, Sex == "F")
#arrange-排列dataset
arrange(students, programme, -grade)
#select-选择呈现部分dataset
select(students, student_number, programme)
#mutate-通过计算已有variables来新建variables
students <- mutate(students, pass = grade > 5.5)
students
#recode-重命名columns
students_recoded <- mutate(students, 
  programme = recode(programme, "A" = "Science", "B" = "Social Science")
)
#group_by()
#summarize()

Data processing pipelines

#pipeline

#对同一个dataset进行多个步骤的操作
popular_apps <-
  read_csv("data/googleplaystore.csv") %>% 
  mutate(Downloads = parse_number(Installs)) %>% 
  filter(Downloads > 5e8) %>% # 5e8 is the same as 5 x 10^8
  arrange(-Rating) %>% 
  select(App, Rating, Reviews, Downloads, Category) %>% 
  distinct(App, .keep_all = TRUE)
popular_apps

Grouping and summarisation

#summarize

#自定义需要summarize的内容
students_dataset %>% 
  group_by(prog) %>%  #optional,按照哪个类别分类,出来会是这几类
  summarise(
    mean = mean(grade), 
    variance = var(grade), 
    med = median(grade),
    min = min(grade), 
    max = max(grade)
  )
#自行选择两个类别进行summary并分析
read_csv("data/googleplaystore.csv") %>% 
  filter(Category == "GAME" | Category == "COMMUNICATION") %>% 
  select(App, Category, Rating) %>% 
  distinct(App, .keep_all = TRUE) %>% 
  group_by(Category) %>% 
  summarise(
    mean = mean(Rating, na.rm = TRUE),
    median = median(Rating, na.rm = TRUE)
  )