3.1 Data Wrangling - 数据类型,新建,摘要
https://dgoretzko.github.io/slv/practicals/01_Data_Wrangling/01_Data_Wrangling_Answers.html
Data types
#install #library #class
install.packages() # install a package (placeholder call; pass the package name as a string)
library() # load (attach) an installed package (placeholder call)
install.packages("tidyverse")
library(tidyverse)
class() # inspect an object's class, i.e. its data type (placeholder call)
class(cats$Sex)
levels(cats$Sex) # presumably Sex is a factor, so this lists its levels -- confirm
object_7 <- as.numeric(object_7) # convert the object to numeric, overwriting it
Lists and data frames
#object #dataframe #function #bind #binddataframe
# Create a list that bundles several existing objects
objects <- list(
  object_1, object_2, object_3, object_4,
  object_5, object_6, object_7
)

# Create a data frame with one named column per source object
dat <- data.frame(
  Var1 = object_1,
  Var2 = object_2,
  Var3 = object_5
)
dat

# Inspect (not set) the size of the data frame: columns first, then rows
ncol(dat)
nrow(dat)
#更详细版本的新建dataframe
Create a data frame called balance_df with 3 columns and 500 rows: student always 0, balance ranging from 0 to 3000, and income always the mean income in the default_train dataset.
# Method 1: build the 500-row table directly with tibble()
balance_df <- tibble(
  student = rep(0, times = 500), # always 0
  balance = seq(from = 0, to = 3000, length.out = 500), # evenly spaced 0..3000
  income = rep(mean(default_train$income), times = 500) # constant: mean income
)
# Method 2: name the building blocks first, then assemble a base data.frame
n_rows <- 500
mean_income <- mean(default_train$income)
balance_values <- seq(from = 0, to = 3000, length.out = n_rows)

balance_df <- data.frame(
  student = rep(0, times = n_rows), # always 0
  balance = balance_values, # evenly spaced 0..3000
  income = rep(mean_income, times = n_rows) # constant: mean income
)
str(balance_df) # show the structure of the result
# Stack two data frames into one; the extra column "Set" (created via .id)
# records which source data frame each row came from
elastic <- bind_rows(
  list("Elastic1" = elastic1, "Elastic2" = elastic2),
  .id = "Set"
)
# Define a new function (see week 4)
#' Median absolute deviation (unscaled)
#'
#' Computes median(|x - median(x)|).
#' Note: this definition masks stats::mad(), which additionally multiplies the
#' result by a consistency constant (1.4826 by default), so the two give
#' different numbers.
#'
#' @param x A numeric vector.
#' @param na.rm Drop missing values before computing? Defaults to FALSE, in
#'   which case any NA in `x` makes the result NA (the original behaviour).
#' @return A single numeric value.
mad <- function(x, na.rm = FALSE) {
  centre <- median(x, na.rm = na.rm)
  # x - centre keeps NA positions, so na.rm is needed for the outer median too
  median(abs(x - centre), na.rm = na.rm)
}
# Apply the custom mad() to the grade column; the result column is named mad
# (a single row if students_dataset is ungrouped)
students_dataset %>% summarise(mad = mad(grade))
Loading, viewing, and summarising data
#read #tidyr #gather #spread #separate #unite
# Read a CSV file from a URL, store it in a variable, and look at the first rows
con <- url("https://www.gerkovink.com/slv/practicals/01_Data_Wrangling/data/googleplaystore.csv")
apps <- read_csv(con)
head(apps)
# Read a local .xlsx file.
# read_xlsx() comes from readxl, which is installed together with the tidyverse
# but is NOT attached by library(tidyverse), so it must be loaded explicitly.
library(readxl)
con <- read_xlsx("data/con.xlsx")
head(con) # first rows
tail(con) # last rows
View(con) # open the data set in a spreadsheet-like viewer (as in Excel)
summary(con) # per-column summary
# When rows and columns are arranged untidily, reshape the table with tidyr.
# Note: gather()/spread() still work but are superseded by
# pivot_longer()/pivot_wider() in current tidyr.
library(tidyr)
gather(cases, "year", n, 2:4) # collapse columns 2-4 into key column "year" and value column n
spread() # placeholder call: the inverse of gather()
separate() # placeholder call: split one column (e.g. a date) into year, month and day columns
unite() # placeholder call: combine several columns (e.g. three) into one
Data transformation with dplyr
verbs
#dplyr #filter #arrange #select #mutate #recode #summarize #group_by
dplyr cheatsheet: see "1.4 Cheatsheets"
# The verbs below (filter, arrange, select, mutate, recode, summarise, group_by)
# come from dplyr, so dplyr must be attached first.
# library(tidyverse) also attaches dplyr.
library(dplyr)
# filter - keep only the rows that satisfy every listed condition
filter(students, grade < 5.5)
filter(students, grade > 8, programme == "A")
filter(cats, Bwt > 2, Bwt < 2.2, Sex == "F")
# arrange - sort the rows: by programme, then by grade in descending order (-grade)
arrange(students, programme, -grade)
# select - keep only the listed columns of the data set
select(students, student_number, programme)
# mutate - create new variables computed from existing ones
students <- mutate(students, pass = grade > 5.5)
students
# recode - replace values inside a column (it does not rename columns):
# here the programme codes "A" and "B" are swapped for descriptive labels
students_recoded <- students %>%
  mutate(
    programme = recode(programme, "A" = "Science", "B" = "Social Science")
  )
#group_by()
#summarize()
Data processing pipelines
#pipeline
# Pipeline: run several steps in sequence on the same data set with %>%
popular_apps <- read_csv("data/googleplaystore.csv") %>%
  mutate(Downloads = parse_number(Installs)) %>% # numeric value parsed from Installs
  filter(Downloads > 5e8) %>% # 5e8 is the same as 5 x 10^8
  arrange(-Rating) %>% # sort by Rating, descending
  select(App, Rating, Reviews, Downloads, Category) %>%
  distinct(App, .keep_all = TRUE) # one row per App, other columns kept
popular_apps
Grouping and summarisation
#summarize
# Choose which statistics the summary should report
students_dataset %>%
  group_by(prog) %>% # optional: grouping variable; the output has one row per group
  summarise(
    mean = mean(grade),
    variance = var(grade),
    med = median(grade),
    min = min(grade),
    max = max(grade)
  )
# Pick two categories, summarise each, and compare them
read_csv("data/googleplaystore.csv") %>%
  filter(Category %in% c("GAME", "COMMUNICATION")) %>% # keep the two categories
  select(App, Category, Rating) %>%
  distinct(App, .keep_all = TRUE) %>% # one row per App
  group_by(Category) %>%
  summarise(
    mean = mean(Rating, na.rm = TRUE), # ignore missing ratings
    median = median(Rating, na.rm = TRUE)
  )