Rev. | 86851c4580b9c84e6497d31e63a5fff6ac158664 |
---|---|
サイズ | 65,506 バイト |
日時 | 2018-06-08 05:29:21 |
作者 | Lorenzo Isella |
ログメッセージ | I added a black like for the borders of the countries in the map. |
\documentclass[12pt]{beamer}
\usepackage{graphicx}
% \usepackage[T1]{fontenc}
% \usepackage{emerald}
\usepackage{tikz}
\usepackage[labelformat=empty]{caption}
% \usepackage{cprotect}
\usepackage{listings}
\lstset{breaklines=true}
% <<setup, include=FALSE>>=
% library(knitr)
% render_listings()
% @
\usetheme{default}
\beamertemplatenavigationsymbolsempty
\hypersetup{pdfpagemode=UseNone} % don't show bookmarks on initial view
% named colors
\definecolor{offwhite}{RGB}{249,242,215}
\definecolor{foreground}{RGB}{255,255,255}
\definecolor{background}{RGB}{24,24,24}
\definecolor{title}{RGB}{107,174,214}
\definecolor{gray}{RGB}{155,155,155}
\definecolor{subtitle}{RGB}{102,255,204}
\definecolor{hilight}{RGB}{102,255,204}
\definecolor{vhilight}{RGB}{255,111,207}
\definecolor{lolight}{RGB}{155,155,155}
%\definecolor{green}{RGB}{125,250,125}
% use those colors
\setbeamercolor{titlelike}{fg=title}
\setbeamercolor{subtitle}{fg=subtitle}
\setbeamercolor{institute}{fg=gray}
\setbeamercolor{normal text}{fg=foreground,bg=background}
\setbeamercolor{item}{fg=foreground} % color of bullets
\setbeamercolor{subitem}{fg=gray}
\setbeamercolor{itemize/enumerate subbody}{fg=gray}
\setbeamertemplate{itemize subitem}{{\textendash}}
\setbeamerfont{itemize/enumerate subbody}{size=\footnotesize}
\setbeamerfont{itemize/enumerate subitem}{size=\footnotesize}
%% Grey (gray) Background Colour
\setbeamercolor{background canvas}{bg=gray!30!black}
%%%%Uncomment the part in the frame if you want the blackboard effect
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%% Random Dust Trails
%\pgfmathsetseed{\number\pdfrandomseed} % seed for random generator
% \setbeamertemplate{background}{
% \begin{tikzpicture}
% \useasboundingbox (0,0) rectangle (\the\paperwidth, \the\paperheight);
% \foreach \i in {1,...,30} {
% \pgfmathsetmacro{\x}{random(0,10000)/5000-1}%
% \pgfmathsetmacro{\y}{random(0,10000)/10000-0.1}%
% \pgfmathsetmacro{\r}{random(0,10000)/1000-5}%
% \rotatebox{\r}{
% \pgftext[at=\pgfpoint{\x\paperwidth}{\y\paperheight}, left, base]{\includegraphics[width=\textwidth]{paintstroke.png}}
% }
% };
% \end{tikzpicture}
% }
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%% Uncomment the part in the frame if you want to use the Augie font
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Now try to set the Augie font everywhere
% \setbeamerfont{framesubtitle}{series=\ECFAugie}
% \setbeamerfont{title}{series=\ECFAugie}
% \setbeamerfont{caption}{series=\ECFAugie}
% \setbeamerfont{author}{series=\ECFAugie}
% \setbeamerfont{institute}{series=\ECFAugie}
% \setbeamerfont{date}{series=\ECFAugie}
% \setbeamerfont{frametitle}{series=\ECFAugie}
% \setbeamerfont{item}{series=\ECFAugie}
% %% use a small dash ('-') for a bulletpoint list
% \setbeamertemplate{itemize item}{\usebeamercolor[fg]{item}\small\ECFAugie{-}}
% see https://tex.stackexchange.com/questions/320223/how-to-enforce-a-font-series-in-beamer-for-normal-default-text/320244
% \setbeamerfont{normal text}{series= \ECFAugie}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% page number
\setbeamertemplate{footline}{%
\raisebox{5pt}{\makebox[\paperwidth]{\hfill\makebox[20pt]{\color{gray}
\scriptsize\insertframenumber}}}\hspace*{5pt}}
% add a bit of space at the top of the notes page
\addtobeamertemplate{note page}{\setlength{\parskip}{12pt}}
% a few macros
% Shorthand that opens an itemize list; must be paired with \ei.
\newcommand{\bi}{\begin{itemize}}
% Shorthand that closes the itemize list opened by \bi.
\newcommand{\ei}{\end{itemize}}
% Alias for \includegraphics; it expands to that command, so the usual
% optional and mandatory arguments can follow it directly.
\newcommand{\ig}{\includegraphics}
% \subt{<text>}: typesets <text> in \footnotesize using the "subtitle" colour
% defined in the preamble. The outer brace group keeps the size and colour
% change local to the argument.
\newcommand{\subt}[1]{{\footnotesize \color{subtitle} {#1}}}
% Compile with Rscript -e "library(knitr); knit('./R-course.Rnw')"
% Title-page metadata consumed by \titlepage in the first frame.
\title{Introduction to R}
% \subtitle is the field printed by \titlepage (styled by the "subtitle"
% beamer colour set above). \framesubtitle is meant for use inside a frame and
% does not put anything on the title page.
\subtitle{A researcher's perspective}
\author{ {Lorenzo Isella}}
\institute{DG TRADE, G2, Chief Economist Team}
\date{June 7, 2018}
% Select the "normal text" beamer font at the start of the document body.
\AtBeginDocument{\usebeamerfont{normal text}}
\begin{document}
\frame{
\titlepage}
% \begin{frame}
% \frametitle{Harsh Reality}
% % \framesubtitle{Test Frame}
% % \subt{An optional subtitle}
% By the end of this training you will \underline{not}
% \begin{itemize}
% \item be a statistician/data analyst
% \item be an extremely proficient R user
% \item dump Excel for good.
% \end{itemize}
% You do not become an expert at using any non-trivial tool in 10
% hours.
% So what can you expect to get from this training?
% \end{frame}
% \begin{frame}
% \frametitle{What to Expect from this Training}
% % \framesubtitle{Test Frame}
% % \subt{An optional subtitle}
% On the other hand, by the end of this training you will
% \begin{itemize}
% \item know there is a tool able to make it easier to repeat simple tedious and error-prone data tasks
% \item know that data analytics is not about typing a handful of fancy
% excel commands
% \item know that you are not alone in your data struggle. Someone else
% most likely had the same issue tormenting you. With a bit of luck,
% she has already coded in R the solution you need!
% \end{itemize}
% \end{frame}
\begin{frame}
\frametitle{Overview and Goals}
R is a \underline{statistical environment}.
In this course we'll explore applications of R to perform
\begin{itemize}
\item Data manipulations ($\to$ answer statistical questions).
\item Data visualisations (more powerful and intuitive than a table).
\item Linear models [(mis)(ab)used everywhere, but loved for their
interpretability and ease of deployment].
\end{itemize}
\underline{Goal of the training}: illustrate what R can do in several fields of
statistics. \underline{You} will be able to choose for yourself where it suits you best.
\end{frame}
\begin{frame}
% \frametitle{Overview of the Training}
% \begin{itemize}
% \item Philosophy of the training: your goal is to get better,
% faster and more productive at data analysis.
% \item You are not interested in the 6 different kinds of atomic
% vectors in R.
% \item So we will go head over heels on the basics and
% \item plunge into the tidyverse. Tidyverse is a collection of tools
% for powerful and expressive data analysis and visualisation.
% \item We will barely scratch the surface of many topics, but you
% will have an idea of the state-of-the art R for data mining.
% \end{itemize}
% Do not expect to become and R guru in a few hours, but you will
% appreciate the advantages R can bring you in the long run.
% \end{frame}
% \begin{frame}
\frametitle{R and Statistical Computing}
R is a statistical environment bringing you
\begin{itemize}
\item an effective data handling and storage facility,
\item a suite of operators for calculations on arrays, in particular matrices,
\item a large, coherent, integrated collection of intermediate tools for data analysis,
\item graphical facilities for data analysis and display either directly at the computer or on hard-copy, and
\item a well developed, simple and effective programming language.
\end{itemize}
Most of the training will be devoted to making all the above much less abstract.
\end{frame}
\begin{frame}
\frametitle{Other Reasons to Use R}
\begin{itemize}
\item R is free and cross-platform. It runs on your Windows, Mac and
Linux machine. No fees, no trial periods. Visit {\url{https://www.r-project.org/}}
\item R can be used to analyse your data and
produce \underline{publication-quality} visualisations.
\item R is extended by hundreds of high quality packages often
developed by leading specialists in their field.
\item R has a large user base ($>$ 1.000.000 users) and it is
\emph{de facto} a lingua franca for computational statistics.
% \item R runs on Windows, Linux and MAC computers and...it is all
% free!
\end{itemize}
\end{frame}
% This frame contains a knitr chunk, whose output is typeset in a
% verbatim-like environment; beamer needs the [fragile] option for that
% (all the other frames with chunks in this file already use it).
\begin{frame}[fragile]
\frametitle{Running an R Session}
How you run your session depends on several factors.
A recommendation could be
\begin{itemize}
\item Download and install R for your platform from
\url{https://cran.rstudio.com/}
\item followed by R Studio (a set of tools to make you productive with
R) at \url{https://www.rstudio.com/products/rstudio/download/}
\item You can do your work in R as an interactive session, but most of
the time you will save your work (i.e. the commands you typed) as an \underline{R script}.
\item An R script is a plain, human readable text file traditionally with a
``.R'' extension and you can run it by typing
% Chunk is displayed only (eval=F): the script file does not exist here.
<< highlight=T, eval=F,message=F >>=
source("myRscript.R")
@
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Some Caveats}
The training will show you (some of) the things R can do. We'll cut a
few corners here and there. In particular
\begin{itemize}
\item we will not cover the basics at all (data types, data
structures, functions, plotting etc...)
\item we will jump head first into some applications, but all the
code and the data sets will be provided (everything in this training
is reproducible by you at home).
\end{itemize}
As a consequence, feel free to interrupt and ask \underline{any}
questions at \underline{any} moment during the presentation.
\end{frame}
\begin{frame}[fragile]
\frametitle{Inspirational Quote}
``Let us change our traditional attitude to the construction of programs. Instead of imagining that our main task is to instruct a computer what to do, let us concentrate rather on explaining to human beings what we want a computer to do.''
\vspace*{1.5cm}
Donald Knuth
\end{frame}
\begin{frame}[fragile]
\frametitle{Philosophy in Practice: Data Manipulation in R}
When working with data you must
\begin{itemize}
\item Figure out what \underline{you want to do}.
\item Describe those tasks in the form of a computer program.
\item Execute the program.
\end{itemize}
The dplyr package (part of the tidyverse) makes these steps fast and easy
\begin{itemize}
\item By constraining your options, it helps you think about your data manipulation tasks.
\item It provides simple ``verbs'', functions that correspond to the most common data manipulation tasks, to help you translate your thoughts into code.
\item It uses efficient backends, so you spend less time waiting for the computer.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{A Toy Data Set}
``tempcity'' is a data set of city temperatures [in
$^{\circ}\mathrm{C}$] over 4 weeks.
\begin{table}[ht]
\centering
\begin{tabular}{rlr}
\hline
week & city & temperature \\
\hline
1 & A & 14 \\
1 & B & 18 \\
1 & C & 23 \\
2 & A & 15 \\
2 & B & 21 \\
2 & C & 24 \\
3 & A & 12 \\
3 & B & 25 \\
3 & C & 23 \\
4 & A & 13 \\
4 & B & 17 \\
4 & C & 25 \\
\hline
\end{tabular}
\end{table}
Let us ask it some questions and formulate them in R.
\end{frame}
\begin{frame}[fragile]
\frametitle{Excursus: Tidy Data}
The tempcity data set is a \underline{tidy} data set. Tidy data has the following properties
\begin{enumerate}
\item Each variable forms a column.
\item Each observation forms a row.
\item Each type of observational unit forms a table.
\end{enumerate}
You may not like a data set with many rows, but think that
\begin{itemize}
\item Tidy data makes it easy for an analyst or a computer to extract needed
variables because it provides a standard way of structuring a
dataset.
\item Another way to put it: you do not need different strategies to extract different variables.
\end{itemize}
Let us see how straightforward it can be to manipulate a tidy data set.
\end{frame}
\begin{frame}[fragile]
\frametitle{A Grammar for Data Manipulation}
dplyr is the main tidyverse tool for data manipulation. It is a \underline{grammar} because it provides \underline{verbs} that help you solve the most common data manipulation challenges:
\begin{itemize}
\item mutate() adds new variables that are functions of existing variables
\item select() picks variables based on their names.
\item filter() picks cases based on their values.
\item summarise() reduces multiple values down to a single summary.
\item arrange() changes the ordering of the rows.
\item group\verb|_|by() which allows you to perform any operation ``by group''.
\end{itemize}
There are several other verbs available, but this is another story.
\end{frame}
\begin{frame}[fragile]
\frametitle{Task 1}
\underline{Summarise} the temperature column: calculate the mean
temperature along all the weeks in all the cities
<< highlight=T, eval=TRUE,message=F >>=
library(tidyverse)
tempcity<-read_csv("tidy.csv")
mean_temp <- tempcity %>%
summarise(aver_temp=mean(temperature))
mean_temp
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Task 2}
\underline{Group} the data \underline{by} city and
\underline{summarise} it with the
mean temperature for every city.
<< highlight=T, eval=TRUE,message=F >>=
mean_temp_city <- tempcity %>%
group_by(city) %>%
summarise(aver_temp=mean(temperature))
mean_temp_city
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Task 3 (or 2 and 1/2)}
\underline{Group} the data \underline{by} week and then \underline{summarise} it with the
mean temperature for every week.
<< highlight=T, eval=TRUE,message=F >>=
mean_temp_week <- tempcity %>%
group_by(week) %>%
summarise(aver_temp=mean(temperature))
mean_temp_week
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Task 4}
\underline{Filter} the temperatures for all the cities for week 4.
<< highlight=T, eval=TRUE,message=F >>=
temp_week_4 <- tempcity %>%
filter(week==4)
temp_week_4
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Task 5}
\underline{Filter} the data for city A, \underline{mutate} the data
set by creating the temperature in Fahrenheit, and \underline{select} all the
columns apart from the temperature in Celsius.
<< highlight=T, eval=TRUE,message=F >>=
city_A_fahr <- tempcity %>%
filter(city=="A") %>%
mutate(Fahrenheit=temperature*1.8+32) %>%
select(-temperature)
city_A_fahr
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Not Just for Fun: Balance of Payment Data}
<< highlight=T, eval=TRUE,message=F, warning=F >>=
library(tidyverse)
df<-read_csv("bop_flow2.csv",
col_types = cols(Value = "d"))
@
Let us glimpse at the resulting table (only a few rows are shown and
they are broken into two chunks)
% df<-read_csv("bop_flow2.csv",col_types = cols(Value = "i"))
\begin{table}[ht]
\centering
\scalebox{0.7}{
\begin{tabular}{rlllll}
\hline
TIME & GEO & CURRENCY & NACE\_R2 & STK\_FLOW & STK\_FLOW\_LABEL \\
\hline
2016 & EU28 & Million euro & TOTAL & ASS & Assets \\
2016 & EU28 & Million euro & TOTAL & ASS & Assets \\
2016 & EU28 & Million euro & TOTAL & ASS & Assets \\
\hline
\end{tabular}
}
\end{table}
\begin{table}[ht]
\centering
\scalebox{0.7}{
\begin{tabular}{rllllr}
\hline
TIME & ENTITY & FDI\_ITEM & FDI\_ITEM\_LABEL & PARTNER & Value \\
\hline
2016 & TOTAL & DO\_\_D\_\_F & Direct investment abroad (DIA) & CH & NA \\
2016 & TOTAL & DO\_\_D\_\_F & Direct investment abroad (DIA) & TR & NA \\
2016 & TOTAL & DO\_\_D\_\_F & Direct investment abroad (DIA) & RU & NA \\
\hline
\end{tabular}
}
\end{table}
\end{frame}
\begin{frame}[fragile]
\frametitle{dplyr Verbs in Action 1/5}
In 2015, how many million euros did the EU28 (GEO) invest
(FDI\verb|_|ITEM is DO\verb|_|\verb|_|D\verb|_|\verb|_|F; ENTITY is TOTAL) in manufacture
(NACE\verb|_|R2 is C) in Japan (PARTNER is JP) as outward net foreign
direct investment (STK\verb|_|FLOW is NO)?
<< highlight=T, eval=TRUE,message=F >>=
library(tidyverse)
manu_JP <- df %>%filter(TIME==2015, GEO=="EU28",
STK_FLOW=="NO",FDI_ITEM=="DO__D__F",
ENTITY=="TOTAL",PARTNER=="JP", NACE_R2=="C") %>%
select(TIME, GEO, PARTNER, NACE_R2, Value)
manu_JP
@
\end{frame}
\begin{frame}[fragile]
\frametitle{dplyr Verbs in Action 2/5}
And the total FDI to the US for all years
<< highlight=T, eval=TRUE,message=F >>=
FDI_US <- df %>%filter( GEO=="EU28",
STK_FLOW=="NO",FDI_ITEM=="DO__D__F",
ENTITY=="TOTAL",PARTNER =="US",NACE_R2=="FDI") %>%
select(TIME, GEO, PARTNER, NACE_R2, Value)
FDI_US
@
\end{frame}
\begin{frame}[fragile]
\frametitle{dplyr Verbs in Action 3/5}
Add a new column with the FDI in dollars (1 EUR = 1.21 USD)
<< highlight=T, eval=TRUE,message=F >>=
FDI_US <- df %>%filter( GEO=="EU28",
STK_FLOW=="NO",FDI_ITEM=="DO__D__F",
ENTITY=="TOTAL",PARTNER =="US",NACE_R2=="FDI") %>%
select(TIME, GEO, PARTNER, NACE_R2, Value)%>%
mutate(ValueUSD=Value*1.21)
FDI_US
@
\end{frame}
\begin{frame}[fragile]
\frametitle{dplyr Verbs in Action 4/5}
And if you want the average FDI to the US along the available years
<< highlight=T, eval=TRUE,message=F >>=
FDI_US_mean <- df %>%filter( GEO=="EU28",
STK_FLOW=="NO",FDI_ITEM=="DO__D__F",
ENTITY=="TOTAL",PARTNER =="US", NACE_R2=="FDI")%>%
summarise(mean_FDI_to_US=mean(Value))
FDI_US_mean
@
\end{frame}
\begin{frame}[fragile]
\frametitle{dplyr Verbs in Action 5/5}
Now you want to do the same for US and India in one go
\vspace*{-0.2cm}
<< highlight=T, eval=TRUE,message=F >>=
FDI_US_IN <- df %>%filter( GEO=="EU28",
STK_FLOW=="NO",FDI_ITEM=="DO__D__F",
ENTITY=="TOTAL",PARTNER %in% c("US", "IN"),
NACE_R2=="FDI")%>%
group_by(PARTNER) %>%
summarise(mean_FDI=mean(Value))
FDI_US_IN
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Final Thoughts on Data Manipulation}
\begin{itemize}
\item We barely scratched the surface of dplyr
\item but we have already seen filter, selection of columns,
creation of new columns and
computing statistics on groups of variables
\item thanks to the pipe operator (\verb|%>%|), most of the code that you write
is reusable and readable. You organise dplyr verbs into pipes.
\item you do not worry about cells, indexes etc..., but you think
more about the questions you want to pose to your data.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{What is a Linear Model?}
A linear model tries to predict the value of a dependent variable $y$
based on the observed values of a set of independent variables (also
called regressors) $\{x_{1},
x_{2},\cdots x_{N} \}$. Typically it assumes we can write
\begin{equation}
y=a_{0}+a_{1}x_{1}+a_{2}x_{2}+\cdots + a_{N}x_{N}
\end{equation}
\begin{itemize}
\item We will not discuss the theory for determining $a_{0},
a_{1}\cdots a_{N}$ here.
\item They are widely (ab)(mis)used in economics, biology, etc...
due to their relatively low computational complexity and ease of interpretation.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Linear Models in R}
\begin{itemize}
% \item R has everything you need for sophisticated statistical models (linear
% models, random forests, kernel methods, neural networks, etc...)
% \item however linear models are (ab)used almost everywhere due to
% their ease of implementation
\item In R the interface to deploy a linear model is as simple as
<< eval=F, highlight=T>>=
lin_model<-lm(response~var1+var2+var3,
data=mydata)
@
where ``response'' is the dependent ($y$) variable and ``var1'' is
an observable (independent variable).
\item Caveat: being able to successfully run a linear model does not
imply at all that the
model specification is appropriate to the data under
scrutiny. \underline{Never} deploy a model whose underlying theory is totally obscure to you.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{A Toy Model}
% We show an example with artificial data
<< f1, highlight=T, eval=TRUE,message=F, fig.height=3 >>=
set.seed(1234) ## for reproducibility
x<- seq(0, 20, len=200)
y<- 10+0.5*x+rnorm(200) #intercept=10, slope=0.5
lin_model<-lm(y~x)
par(bg = 'white') # to set a white background
plot(x, y, ylim=c(5, 25) )
@
% We have a single independent variable ($y$) and we know the ``real''
% coefficients: intercept $=10$ and slope $=0.5$.
% <<fig.height=3, eval=TRUE, dev='png'>>=
% plot(x, y, ylim=c(5, 25))
%@
\end{frame}
% \begin{frame}[fragile]
% \frametitle{Inspect the Results}
% \begin{table}[ht]
% \centering
% \begin{tabular}{rrrrr}
% \hline
% & Estimate & Std. Error & t value & Pr($>$$|$t$|$) \\
% \hline
% (Intercept) & 9.6647 & 0.1435 & 67.37 & 0.0000 \\
% x & 0.5028 & 0.0012 & 406.18 & 0.0000 \\
% \hline
% \end{tabular}
% \end{table}
% Residual standard error: 1.011 on 198 degrees of freedom
% Multiple R-squared: 0.9988, Adjusted R-squared: 0.9988
% F-statistic: 1.65e+05 on 1 and 198 DF, p-value: $<$ 2.2e-16
% \end{frame}
\begin{frame}[fragile]
\frametitle{Inspect the Results}
Use ``summary'' to inspect the model
<< highlight=T, eval=F,message=F >>=
summary(lin_model)
@
\begin{table}
\begin{center}
\begin{tabular}{l c }
\hline
& Toy Model \\
\hline
(Intercept) & $9.67^{***}$ \\
& $(0.14)$ \\
x & $0.53^{***}$ \\
& $(0.01)$ \\
\hline
R$^2$ & 0.90 \\
Adj. R$^2$ & 0.90 \\
Num. obs. & 200 \\
RMSE & 1.01 \\
\hline
\multicolumn{2}{l}{\scriptsize{$^{***}p<0.001$, $^{**}p<0.01$, $^*p<0.05$}}
\end{tabular}
% \caption{Statistical models}
\label{table:coefficients}
\end{center}
\end{table}
The number of * indicates the level of statistical significance of the
estimated coefficients.
\end{frame}
\begin{frame}[fragile]
\frametitle{Can We See What the Model Looks Like?}
\begin{itemize}
\item Use the ``predict'' function.
\end{itemize}
<< f2, highlight=T, eval=TRUE,message=F, fig.height=3 >>=
par(bg = 'white')
plot(x, y, ylim=c(5, 25))
lines(x, predict(lin_model), "l",
col="red", lwd=2)
@
\end{frame}
\begin{frame}[fragile]
\frametitle{GDP in Catalonia}
Goal: develop a model for the GDP of Catalonia based on
\begin{enumerate}
\item Consumer expenditure = CHouse
\item Consumer public administrations = CAdm
\item Equipment of goods and others (capital investment without construction) = Equip
\item Construction = Const
\item Total exports goods and services = Exp
\item Total imports goods and services = Imp
\end{enumerate}
The \underline{tidy} data has been extracted from the Idescat, economic annual
Accounts of Catalonia. See \url{https://frama.link/CK47Qtm4}.
\end{frame}
\begin{frame}[fragile]
\frametitle{A Look at the Numbers}
All units of the DataFrame are presented in Millions of euros (Base 2010).
% latex table generated in R 3.5.0 by xtable 1.8-2 package
% Mon May 14 16:12:11 2018
\begin{table}[ht]
\centering
\scalebox{0.7}{
\begin{tabular}{rrrrrrrr}
\hline
year & GDP & CHouse & CAdm & Equip & Const & Exp & Imp \\
\hline
2016 & 223629 & 121449 & 34998 & 24940 & 14443 & 84668 & 71767 \\
2015 & 215641 & 118176 & 34032 & 23721 & 13508 & 81910 & 69261 \\
2014 & 208018 & 114235 & 32069 & 22150 & 12848 & 77583 & 66549 \\
2013 & 203198 & 111596 & 32077 & 20939 & 13129 & 75112 & 62172 \\
2012 & 203856 & 114317 & 32760 & 21564 & 14834 & 73578 & 63371 \\
2011 & 209716 & 115228 & 34854 & 22148 & 18659 & 69556 & 64510 \\
2010 & 209792 & 113859 & 35569 & 23060 & 21820 & 62441 & 61248 \\
2009 & 208115 & 111997 & 35451 & 22284 & 25123 & 55052 & 53964 \\
2008 & 216922 & 117445 & 32709 & 25736 & 32901 & 65416 & 71719 \\
2007 & 212391 & 114653 & 29674 & 25658 & 34499 & 64427 & 75399 \\
2006 & 199169 & 107097 & 26821 & 22621 & 32668 & 60811 & 70747 \\
2005 & 183507 & 99316 & 24441 & 20041 & 28678 & 55214 & 64700 \\
2004 & 169776 & 91997 & 22367 & 18015 & 25326 & 50971 & 58883 \\
2003 & 156600 & 84675 & 20262 & 16702 & 22773 & 48795 & 53268 \\
2002 & 145813 & 79584 & 18575 & 15388 & 20361 & 48685 & 51608 \\
2001 & 137165 & 74692 & 17131 & 14924 & 18356 & 47344 & 50940 \\
2000 & 127839 & 69916 & 15832 & 14372 & 16329 & 43650 & 49523 \\
\hline
\end{tabular}
}
\end{table}
\end{frame}
\begin{frame}[fragile]
\frametitle{A First Model}
\vspace*{-0.3cm}
<< highlight=T, eval=T,message=F , echo=2 >>=
gdp_data <- read_csv("gdp_catalonia.csv")
gdp_model1 <- lm(GDP~CHouse+CAdm+Equip+Const+
Exp+Imp, data=gdp_data)
@
\vspace*{-0.2cm}
\begin{table}
\begin{center}
\scalebox{0.75}{
\begin{tabular}{l c }
% \hline
% & Model 1 \\
% \hline
(Intercept) & $3732.369$ \\
& $(5345.222)$ \\
CHouse & $1.523^{**}$ \\
& $(0.466)$ \\
CAdm & $0.193$ \\
& $(0.888)$ \\
Equip & $0.869$ \\
& $(0.941)$ \\
Const & $0.328$ \\
& $(0.439)$ \\
Exp & $0.183$ \\
& $(0.386)$ \\
Imp & $-0.217$ \\
& $(0.328)$ \\
\hline
R$^2$ & 0.998 \\
Adj. R$^2$ & 0.997 \\
Num. obs. & 17 \\
RMSE & 1610.421 \\
\hline
\multicolumn{2}{l}{\scriptsize{$^{***}p<0.001$, $^{**}p<0.01$, $^*p<0.05$}}
\end{tabular}
}
% \caption{Statistical models}
% \label{table:coefficients}
\end{center}
\end{table}
\end{frame}
\begin{frame}[fragile]
\frametitle{A Trimmed Model}
Only the coefficient of the household consumption appears to be
significant so let us fit
<< highlight=T, eval=T,message=F >>=
gdp_model2 <- lm(GDP~CHouse, data=gdp_data)
@
\begin{table}
\begin{center}
\begin{tabular}{l c }
\hline
& Simplified Model \\
\hline
(Intercept) & $1381.530$ \\
& $(3346.754)$ \\
CHouse & $1.822^{***}$ \\
& $(0.032)$ \\
\hline
R$^2$ & 0.995 \\
Adj. R$^2$ & 0.995 \\
Num. obs. & 17 \\
RMSE & 2146.172 \\
\hline
\multicolumn{2}{l}{\scriptsize{$^{***}p<0.001$, $^{**}p<0.01$, $^*p<0.05$}}
\end{tabular}
% \caption{Statistical models}
% \label{table:coefficients}
\end{center}
\end{table}
\end{frame}
\begin{frame}[fragile]
\frametitle{Check the Linear Relation}
Check with the naked eye whether there really is a linear relationship between GDP
and household consumption.
<<f3bis, highlight=T, eval=TRUE,message=F, fig.height=3, echo=-1 >>=
par(bg="white")
plot( gdp_data$CHouse,gdp_data$GDP)
lines(gdp_data$CHouse, predict(gdp_model2), "l",
col="red", lwd=2)
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Model Prediction for GDP}
GDP data and model as a time series (year on x axis).
<<f3, highlight=T, eval=TRUE,message=F, fig.height=3, echo=-1 >>=
par(bg="white")
plot(gdp_data$year, gdp_data$GDP)
lines(gdp_data$year, predict(gdp_model2), "l",
col="red", lwd=2)
@
If fresher data and/or forecast on the household expenditure is available, you can use
it to predict the GDP for Catalonia.
\end{frame}
\begin{frame}[fragile]
\frametitle{Data Visualisation}
\begin{itemize}
\item Data visualisation is a HUGE topic. We will not even pretend any
completeness here.
\item It is OK to get the numbers out as a table, but a lot of
features are not evident at all in a set of numbers.
\item The overwhelming majority of people understand images better
than numbers.
\item Caveat: presenting data clearly and effectively is a separate
topic (requiring a separate training!) from programming.
\item There is \underline{not} such a thing as a point and click
application for making your plots in R.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Basic Plotting in R}
We have already encountered the ``plot'' command
<<f4, fig.height=3 , highlight=T, eval=TRUE,message=F >>=
par(bg="white")
set.seed(1213) # for reproducibility
x <- cumsum(rnorm(100))
plot(x, type = 'l') # Brownian motion
@
\end{frame}
\begin{frame}[fragile]
\frametitle{R and Plots}
Pretty much any kind of statistical visualization can be
produced with R
\begin{itemize}
\item line plots
\item histograms/bar charts
\item boxplots
\item heat maps
\item etc...
\end{itemize}
There are \underline{a lot} of specialised tools
beyond ``plot'' to carry out these tasks. ggplot2 (part of the
tidyverse) is very popular. Let's see why.
\end{frame}
\begin{frame}[fragile]
\frametitle{What is ggplot2?}
\begin{itemize}
\item Formal answer: ggplot2 is a system for declaratively creating
graphics which operationalizes the principles laid down in ``The
Grammar of Graphics'' by Wilkinson.
\item In practice: you provide the data, tell ggplot2 how to map variables to aesthetics, what graphical primitives to use, and it takes care of the details.
\item ggplot2 is usually per se the object of dedicated trainings, so
do not worry if some (most) of this is obscure now, but it is
important to know perhaps the most widespread data visualisation
tool in R nowadays.
\item One (of the many) tutorials on ggplot2 can be found at \url{http://r-statistics.co/Complete-Ggplot2-Tutorial-Part1-With-R-Code.html}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{ggplot2: the Setup}
\begin{itemize}
\item Tell ggplot which data set to use
\item Specify the \underline{aesthetics}: variables for the $x$ and
$y$ axis, the variables based on which the color, size, shape and
stroke should change, etc...
% Displayed only (eval=F): the bare ggplot() setup call, before any layer is
% added. The call must be syntactically complete, i.e. both aes( and ggplot(
% are closed, matching the evaluated version used on the following slides.
<<highlight=T, eval=F,message=F >>=
gpl<-ggplot(data = tempcity, aes(x = week,
y = temperature, shape=city, colour=city,
linetype=city))
@
\item If you stop here, ggplot2 will plot \underline{nothing}. It does
not assume you want a scatterplot, a barchart or anything more
exotic. To visualise something, you need to add \underline{layers}.
% First, you need to tell ggplot what dataset to use. This is done using the ggplot(df) function, where df is a dataframe that contains all features needed to make the plot. This is the most basic step. Unlike base graphics, ggplot doesn’t take vectors as arguments.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{ggplot2: the Layers 1/3}
\begin{itemize}
\item Layers in ggplot2 are also called ``geoms''. To plot your data as points,
you start with the basic setup
\vspace*{-0.1cm}
<<highlight=T, eval=T,message=F >>=
gpl<-ggplot(data = tempcity, aes(x = week,
y = temperature,shape=city, colour=city,
linetype=city))
@
and you add one layer for the point visualisation
\vspace*{-0.1cm}
<<f5bis, highlight=T, eval=T,message=F , fig.height=3 >>=
gpl+ geom_point()
@
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{ggplot2: the Layers 2/3}
To visualise the previous data as a line plot, use another layer (\verb|geom_line|)
<<f5tris,highlight=T, eval=T,message=F, fig.height=3 >>=
gpl<-ggplot(data = tempcity, aes(x = week,
y = temperature,shape=city, colour=city,
linetype=city))
gpl+geom_line()
@
\end{frame}
\begin{frame}[fragile]
\frametitle{ggplot2: the Layers 3/3}
\vspace*{-0.1cm}
Layers can be combined to give rise to more elaborate
visualisations. To plot the tempcity data with lines and points, just
add the two layers one on top of the other
\vspace*{-0.2cm}
<<f5tetra,highlight=T, eval=T,message=F, fig.height=3 >>=
gpl<-ggplot(data = tempcity, aes(x = week,
y = temperature,shape=city, colour=city,
linetype=city))
gpl+geom_point()+geom_line()
@
\end{frame}
\begin{frame}[fragile]
\frametitle{The Theme}
Almost everything is set. You may want to use a certain size for the
axis labels, the plot title, change the background colour etc...
This is done by the \verb|theme()| function, but ggplot2 also provides you
with a set of pre-defined themes.
\verb|theme_gdocs()| will provide a result reminiscent of Google
Docs. Further polishing may involve choosing your favourite colour
scale, the width of the lines and the size of the points and so on.
\end{frame}
\begin{frame}[fragile]
\frametitle{Putting It All Together}
\vspace*{-0.3cm}
<< f5, highlight=T, eval=T,message=F, echo=4, fig.height=3 >>=
library(ggplot2)
library(ggthemes)
library(viridis)
ggplot(data = tempcity, aes(x = week,
y = temperature,shape=city, colour=city,
linetype=city)) +
geom_point(size=3)+ geom_line(size=1) +
scale_colour_viridis(discrete=T)+theme_gdocs()+
labs(title="Temperature & the city")+
xlab("Week")+ ylab("Temperature")
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Panel Plots (Facets)}
\begin{itemize}
\item In the previous chart, you had the scatterplot combined with a line plot for all different values of ``city'' plotted in the same chart. What if you want one chart for one city?
\item The answer is
<<highlight=T, eval=F,message=F >>=
facet_wrap(formula)
@
which takes a formula as its argument. The variable(s) in the formula define the panels (e.g. \verb|~city| gives one panel per city), while the layout of the panels is controlled by the \verb|nrow| and \verb|ncol| arguments.
\item Panel plots can be produced with other software different from
R, but they are often hard to make, ugly or require a lot of manual fiddling.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Faceting in Action (Good Luck with Excel...)}
\vspace*{-0.3cm}
% Chunk f6: same temperature-vs-week plot, but split into one panel per city
% with facet_wrap(~city, nrow=1), i.e. all panels on a single row.
% echo=4 makes knitr echo only the 4th expression (the ggplot() call); the
% three library() calls are still run but not shown.
% NOTE(review): tempcity is not created in this chunk -- presumably it is
% defined by an earlier chunk; confirm.
<< f6, highlight=T, eval=T,message=F, echo=4, fig.height=3 >>=
library(ggplot2)
library(ggthemes)
library(viridis)
ggplot(data = tempcity, aes(x = week,
y = temperature)) +
geom_point(size=3)+ geom_line(size=1) +
facet_wrap(~city, nrow=1)+theme_gdocs()+
labs(title="Temperature & the city")+
xlab("Week")+ ylab("Temperature")
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Plotting Geographical Information}
A very common task. You have per-country data and you want to
illustrate the geographical dimension of the data set. For instance
\begin{itemize}
\item you are asked to illustrate the situation of the debt vs the GDP
for the EU28.
\item The IMF, among others, has the data, but you can retrieve it from
Wikipedia
\url{https://en.wikipedia.org/wiki/List_of_countries_by_public_debt}
\item you can produce a table, but it is ``dry'' and not very appealing.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Look at the Debt Data}
% Read country_debt.csv into the object debt and show its first rows.
% NOTE(review): no library() call here -- read_csv() is presumably available
% because the tidyverse was attached by an earlier chunk; confirm.
<< highlight=T, eval=T,message=F >>=
debt <- read_csv("country_debt.csv")
head(debt)
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Get the World Map}
The ``world'' is stored as a set of latitudes/longitudes and various metadata.
% Fetch the 'world' map data, convert it to a tibble and rename its region
% column to country; the next chunk joins on that country column.
% NOTE(review): map_data(), as_tibble and the pipe are not attached in this
% chunk -- presumably loaded by earlier chunks; confirm.
<< highlight=T, eval=T,message=F >>=
df.map <- map_data('world') %>% as_tibble %>%
rename(country = region)
head(df.map)
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Ready to Plot}
Join geographical and debt data and plot them.
% Chunk f7: first, unstyled map.  left_join() keeps every row of df.map and
% adds the debt columns where the country value matches.  The polygons are
% drawn from long/lat, grouped by the group column, and filled by
% debt_as_pct_of_gdp.
<< f7, highlight=T, eval=T,message=F, fig.height=3 >>=
world_debt<-left_join(df.map,debt,by='country')
ggplot(data=world_debt,aes(x = long, y = lat,
group = group)) +
geom_polygon(aes(fill = debt_as_pct_of_gdp))
@
\end{frame}
\begin{frame}[fragile]
\frametitle{A bit of Polishing}
We achieved quite a lot with a few lines of code. We need some extra
effort to make this map look nicer.
% Define theme.map, a reusable ggplot2 theme object for the maps: dark grey
% text, the same light grey (#CCCCCC) for panel, plot and legend backgrounds,
% no grid, no legend key box, and no axis text, ticks or titles.  Nothing is
% plotted here; the object is added to a plot in a later chunk.
<< highlight=T, eval=T,message=F >>=
theme.map <- theme(
text = element_text(color = '#444444')
,panel.background = element_rect(fill = '#CCCCCC')
,plot.background = element_rect(fill = '#CCCCCC')
,legend.background = element_rect(fill ='#CCCCCC')
,panel.grid = element_blank()
,plot.title = element_text(size = 18, face='bold')
,plot.subtitle = element_text(size = 12)
,legend.key = element_blank()
,axis.text = element_blank()
,axis.ticks = element_blank()
,axis.title = element_blank()
)
@
\end{frame}
% colors = c('#009933', '#ffff00',
% 'orange','#e60000')
% ,values = scales::rescale(c(30, 50, 70, 100, 200))
\begin{frame}[fragile]
\frametitle{Apply the Theme and Use some Labels}
\vspace*{-0.25cm}
% Chunk f8: the styled world map, stored in gpl so that later chunks can
% reuse it.  Country polygons get a thin black border, theme.map is applied,
% and countries with no debt value are filled with grey50.
% echo=-2 echoes every expression except the 2nd one (the bare gpl that
% prints the plot), so the slide shows only the code that builds the plot.
% The title and legend labels are plain string literals: wrapping a single
% string in str_c() returns it unchanged, so the calls were dropped.
<< f8, highlight=T, eval=T,message=F, fig.height=3, echo=-2 >>=
gpl<-ggplot(world_debt,aes(x=long, y=lat,
group=group))+
geom_polygon(aes(fill = debt_as_pct_of_gdp ),
colour = "black", size=0.1) +
theme.map +labs(title = 'Debt as %GDP '
,fill = 'Net public debt\nas % of GDP') +
scale_fill_viridis(na.value = "grey50")
gpl
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Focus on Europe}
% Chunk f9: reuse the gpl plot object and restrict the visible window to
% x (long) in [-11, 33] and y (lat) in [35, 70] via coord_cartesian(), which
% changes the displayed range without filtering the underlying data.
<< f9, highlight=T, eval=T,message=F, fig.height=5. >>=
gpl+coord_cartesian(xlim=c(-11,33), ylim=c(35,70))
@
\end{frame}
%\begin{frame}[fragile]
% \frametitle{ggplot2: an Advanced Graphic System for R}
% \begin{itemize}
% \item ggplot2 is a system for declaratively creating graphics, based on The Grammar of Graphics.
% \item You provide the data, tell ggplot2 how to map variables to aesthetics, what graphical primitives to use, and it takes care of the details.
% \item It takes some serious effort to learn, but it is very powerful.
% \item dplyr is a grammar
% \end{itemize}
% \end{frame}
\begin{frame}[fragile]
\frametitle{Open Conclusions}
\begin{itemize}
\item Moving from Excel to R is a big leap: you no longer have
a worksheet and a click (and pray) interface.
\item One of the benefits of R is the possibility of recycling your
code: you never really start from scratch.
\item Reproducibility of your work: do not worry about you or your
colleague accidentally changing the value of a cell.
\item A very supportive community: often you get the answer to what
troubles you from the very developer of the package/function.
\item In the same environment you read, process, plot/output your
data. Forget about cumbersome transferring of data between applications.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Annex}
The slides from now on delve into the basics and other more technical
parts of R and data tidying.
\begin{itemize}
\item They are not essential for getting a general idea of what R
can do for you, but
\item they also cover some nitty-gritty details you need to cope with
when you use R
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Basic Operations in R}
At the very least, you can use R as a calculator
% <<foo, fig.height=4>>=
<<highlight=T>>=
1+1
2/3
@
but there is much more to it.
\end{frame}
\begin{frame}[fragile]
\frametitle{Basic Plotting in R}
One of the strengths of R is the ease of generating good-looking plots
<<mylab, fig.height=3 , highlight=T, eval=TRUE,message=F >>=
set.seed(1213) # for reproducibility
x <- cumsum(rnorm(100))
par(bg = 'white')
plot(x, type = 'l') # Brownian motion
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Basic Statistics in R}
You have plenty of built-in functions to calculate your statistics
<<my-label2 , highlight=T, eval=TRUE>>=
set.seed(1213) # for reproducibility
x <- cumsum(rnorm(100))
mean(x)
median(x)
sum(x)
@
\end{frame}
% \begin{frame}[fragile]
% \frametitle{Advanced Tools for Data Analysis}
% We split the dataset of flights in New York City airport into
% individual planes and then
% summarise each plane by counting the number of flights (count = n())
% and computing the average distance (dist = mean(distance, na.rm =
% TRUE)) and arrival delay (delay = mean(arr{\verb|_|}delay, na.rm = TRUE)).
% << highlight=T, eval=TRUE,message=F >>=
% library(nycflights13)
% library(tidyverse)
% by_tailnum <- group_by(flights, tailnum)
% delay <- summarise(by_tailnum,
% count = n(),
% dist = mean(distance, na.rm = TRUE),
% delay = mean(arr_delay, na.rm = TRUE))
% @
% This will be made clear later on. Just notice this is almost human-readable.
% \end{frame}
\begin{frame}[fragile]
\frametitle{Data Types in R}
The \underline{basic} data types in R are
\begin{itemize}
\item character: \verb|"a"|, \verb|"swc"|
\item numeric: 2, 15.5
\item integer: 2L (the L tells R to store this as an integer)
\item logical: TRUE, FALSE
\item complex: 1+4i (complex numbers with real and imaginary parts)
\end{itemize}
You can also create your own data types and/or data structures, but we will not discuss this
in these notes. Later on, we will meet tibbles -- the tidyverse
reinterpretation of base R data frames.
\end{frame}
\begin{frame}[fragile]
\frametitle{How Data is Structured in R}
R operates on named data structures
\begin{itemize}
\item vectors
\item lists
\item matrices
\item arrays
\item data frames
\end{itemize}
and in R you can write \emph{functions} to powerfully extend the language.
\end{frame}
\begin{frame}[fragile]
\frametitle{Vectors 1/2}
A vector is a sequence of data elements of the same basic type.
<< eval=TRUE>>=
v1 <- c(2, 3, 5) # numeric values
v2 <- c(TRUE, FALSE, TRUE) # logical values
v3 <- c("aa", "bb", "cc", "dd", "ee") ## strings
@
You can do e.g. arithmetic on numeric vectors
<< eval=TRUE, highlight=T>>=
a <- c(2, 3, 5)
b <- c(5, -1, 6)
a+2
a+b
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Vectors 2/2}
You can join and/or subset vectors and you have facilities to easily
generate some sequences
<< eval=TRUE, highlight=T>>=
a <- c(2, 3, 5)
b <- c(5, -1, 6)
c(a,b)
a[2:3]
seq(2, 8, by= 2)
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Lists 1/3}
A list generalises the idea of a vector. It can hold items of
different types. The name tag is optional
\vspace*{-0.2cm}
<< eval=TRUE, highlight=F>>=
Lst <- list(name="Fred", wife="Mary",
no.children=3,child.ages=c(4,7,9))
Lst
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Lists 2/3}
List size can be increased on the fly. List contents can be accessed either by index or by name
<< eval=TRUE, highlight=F>>=
Lst$name
Lst[[1]]
Lst[1]
@
Note the difference between $[[\cdots]]$ (extracts an element
from a list, drops the name tag) and $[\cdots]$ (creates a sublist, keeps
the name tag).
\end{frame}
\begin{frame}[fragile]
\frametitle{Lists 3/3}
Lists can be concatenated and increased on the fly
\vspace*{-0.3cm}
<< eval=TRUE, highlight=F>>=
ls1 <- list("aa", 2.3)
ls2 <- list("bb", 4.5)
ls1 <- c(ls1, ls2)
ls1
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Matrices and Arrays 1/3}
An array can be considered as a multiply subscripted collection of data entries, for example
numeric. A matrix is a 2-dimensional array, but it is such an important
special case that R contains many operators and functions that are
available only for matrices.
<< eval=TRUE, highlight=F>>=
x <- array(1:4, dim=c(2,2))
x
y <- matrix(1:4, 2,2)
y
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Matrices and Arrays 2/3}
We can slice a matrix by selecting its columns/rows or a single entry
<< eval=TRUE, highlight=F>>=
z <- matrix(5:8, 2,2)
z
z[2,]
z[ , 1]
z[2,1]
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Matrices and Arrays 3/3}
We can join matrices by rows and columns
<< eval=TRUE, highlight=F>>=
cbind(y,z)
rbind(y,z)
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Data Frames 1/2}
Data frames are similar to tables in databases. Each column holds the
same type, and the columns can have header names. A data frame is more general than a matrix, in that different columns can have different modes (numeric, character, factor, etc.)
<< eval=TRUE, highlight=T>>=
people = c("Alex", "Barb", "Carl") # col 1
ages = c(19, 29, 39) # col 2
df = data.frame(people, ages) # create
names(df) = c("NAME", "AGE") # headers
df
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Data Frames 2/2}
We can slice a data frame like a matrix or also select its columns by name
<< eval=TRUE, highlight=T>>=
df[ ,1]
df$NAME
@
Internally, R sees a data frame as a list with class ``data.frame''.
\end{frame}
\begin{frame}[fragile]
\frametitle{Mutability of Data Structures}
Of course all the data structures in R can be altered. We use ``='' or
``\verb|<-|'' to assign values.
See for instance
<< eval=TRUE, highlight=T>>=
x <- c(1,2,3)
x[2] <- -4
x
#and sometimes the puzzling
y =2
y= y +7 # new y = old y +7
y
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Mutability of Data Structures -- Small Caveat}
We saw that ``='' can be used to assign a value. Instead, ``==''
is a \underline{logical} operator that checks if
two values/objects are identical.
See for instance
<< eval=TRUE, highlight=T>>=
x = 2
x
x == 2
x == 3
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Functions in R 1/2}
A function is defined by an assignment of the form
% Template of an R function definition.  eval=F: the code is displayed but
% never executed.  The variadic placeholder is written with R's own
% three-dot token (three ASCII full stops), which is valid R syntax and is
% safe for both the highlighter and pdflatex, unlike the single Unicode
% ellipsis character.
<< eval=F, highlight=T >>=
name <- function(arg_1, arg_2, ...) expression
@
The expression is an R expression that uses the arguments, arg\verb|_|i, to calculate a value. The value of the expression is the value returned for the function.
mean(), sum(), cumsum() and c() are examples of built-in R functions we have
already met.
\end{frame}
\begin{frame}[fragile]
\frametitle{Functions in R 2/2}
Example functions of one and two variables.
% Define and call two small example functions: double(x) returns x*2, and
% double_and_triple(x, y) returns the two-element vector c(x*2, y*3).
% Both are then called with a = 7 and b = 5.
% NOTE(review): the name double is also a base R function; since this chunk
% is evaluated, the definition presumably masks base::double() for any later
% chunk -- confirm that no later code relies on the base version.
<< eval=T, highlight=T >>=
double <- function(x){ x*2}
double_and_triple <- function(x,y) {c(x*2, y*3) }
a <-7
b <- 5
double(a)
double_and_triple(a,b)
@
\end{frame}
% \begin{frame}[fragile]
% \frametitle{Functions in R 3/2}
% A technical remark: functions do \underline{not} modify their own arguments
% \end{frame}
\begin{frame}[fragile]
\frametitle{Data Input and Output in R}
\begin{itemize}
\item R provides a number of facilities to import external data in different
formats (csv file, Excel workbook, SQL database, Stata dta file, etc.).
\item I personally work most of the time with csv files, which can be
input/output by Excel. For importing and manipulating data, I recommend the
tidyverse library.
\end{itemize}
% Illustrative csv round trip with the tidyverse readers/writers.
% eval=F: the code is only displayed, never executed, so filename.csv and
% my_output_data.csv are placeholders and need not exist.
<< eval=F, highlight=T>>=
library(tidyverse)
# read data
mydata<-read_csv("filename.csv")
# write data
write_csv(mydata, "my_output_data.csv")
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Long Computations in R}
R is a functional language, which means that your code often contains a lot of parentheses, ( and ). When you have complex code, this often means that you will have to nest those parentheses together. This makes your R code hard to read and understand.
<< eval=T, highlight=T>>=
## generate some arbitrary data
x<-c(1e4, 1.1e4, 2.3e4, 1.8e4,7e4,4.1e4)
# Compute the logarithm of `x`, return suitably
# lagged and iterated differences,
# compute the exponential function
# and round the result
round(exp(diff(log(x))), 1)
@
\end{frame}
% \begin{frame}[fragile]
% \frametitle{Long Computations in R}
% Computations can often result in expressions which are hard to read.
% << eval=T, highlight=T>>=
% ## generate some arbitrary data
% x<-c(1e4, 1.1e4, 2.3e4, 1.8e4,7e4,4.1e4)
% # Compute the logarithm of `x`, return suitably
% # lagged and iterated differences,
% # compute the exponential function
% # and round the result
% round(exp(diff(log(x))), 1)
% @
% Wouldn't it be nice to have a way to express these operations which is
% easy to read and understand?
% \end{frame}
\begin{frame}[fragile]
\frametitle{Enter the Pipe Operator}
The pipe operator \verb|%>%| has two fundamental properties
\begin{enumerate}
\item Function $f(x)$ can be rewritten as $x$ \verb|%>%| $f$
<< eval=T, highlight=F >>=
x <- 10
# Compute the logarithm of `x`
log(x)
x %>% log()
@
\item Function $f(x, y)$ can be rewritten as $x$ \verb|%>%| $f(y)$
<< eval=T, highlight=F >>=
# Round pi
round(pi, 6)
pi %>% round(6)
@
\end{enumerate}
\end{frame}
\begin{frame}[fragile]
\frametitle{Why was This Invented at All?}
The pipe operator \verb|%>%| provides you with a number of benefits
\begin{enumerate}
\item You'll structure the sequence of your data operations from left to right, as opposed to from inside and out;
\item You'll avoid nested function calls;
\item You'll minimize the need for local variables and function definitions; and
\item You'll make it easy to add steps anywhere in the sequence of operations.
\end{enumerate}
% Side-by-side illustration: the nested call log(sin(sqrt(x))) and the same
% computation written as a left-to-right pipeline.
% eval=F: displayed only, never executed, so no value of x is required here.
<< eval=F, highlight=T >>=
log(sin(sqrt(x))) # becomes
x %>% sqrt() %>%
sin() %>%
log() #much easier to follow!
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Application to the Previous Example}
This sounds very abstract, but let us see \verb|%>%| in action
<< eval=T, highlight=T>>=
library(tidyverse)
x<-c(1e4, 1.1e4, 2.3e4, 1.8e4,7e4,4.1e4)
x %>% log() %>%
diff() %>%
exp() %>%
round(1)
@
Now you finally understand what is going on. Cleaner code is easier to
share and extend.
\end{frame}
\begin{frame}[fragile]
\frametitle{Modify a Sequence of Computations}
Now that the operations are laid out as a sequence, it is much easier to modify them whenever we need to. For instance
% Variation on the previous pipeline: exp() is replaced by mean() and the
% result is rounded to 2 digits instead of 1.
% x is not assigned in this chunk; the chunk is evaluated, so it relies on
% the vector x left in the session by the preceding evaluated chunk.
<< eval=T, highlight=T>>=
# Compute the logarithm of `x`, return suitably
# lagged and iterated differences,
# compute the mean
# and round the result with two digits
library(tidyverse)
x %>% log() %>%
diff() %>%
mean() %>%
round(2)
@
\end{frame}
% \begin{frame}[fragile]
% \frametitle{Tidyverse and R}
% \begin{itemize}
% \item R is extended by packages, i.e. collections of tools/functions
% for a variety of purposes.
% \item The tidyverse (\url{https://www.tidyverse.org/}) is an opinionated collection of R packages designed for data science. All packages share an underlying design philosophy, grammar, and data structures.
% \item Personal opinion: it will take you some time to understand
% the tidyverse, but then you will never look back.
% \end{itemize}
% \end{frame}
% \begin{frame}[fragile]
% \frametitle{dplyr -- Data Manipulation 1/2}
% dplyr (part of the tidyverse family) is a \underline{grammar of data manipulation}.
% When working with data you must
% \begin{itemize}
% \item Figure out what you want to do.
% \item Describe those tasks in the form of a computer program.
% \item Execute the program.
% \end{itemize}
% The dplyr package makes these steps fast and easy
% \begin{itemize}
% \item By constraining your options, it helps you think about your data manipulation challenges.
% \item It provides simple “verbs”, functions that correspond to the most common data manipulation tasks, to help you translate your thoughts into code.
% \item It uses efficient backends, so you spend less time waiting for the computer.
% \end{itemize}
% % filter() to select cases based on their values.
% % arrange() to reorder the cases.
% % select() and rename() to select variables based on their names.
% % mutate() and transmute() to add new variables that are functions of existing variables.
% % summarise() to condense multiple values to a single value.
% % sample_n() and sample_frac() to take random samples.
% \end{frame}
% \begin{frame}[fragile]
% \frametitle{dplyr -- Data Manipulation 2/2}
% dplyr is a grammar because it provides verbs that help you solve the most common data manipulation challenges:
% \begin{itemize}
% \item mutate() adds new variables that are functions of existing variables
% \item select() picks variables based on their names.
% \item filter() picks cases based on their values.
% \item summarise() reduces multiple values down to a single summary.
% \item arrange() changes the ordering of the rows.
% \item group\verb|_|by() which allows you to perform any operation ``by group''.
% \end{itemize}
% This works beautifully with the pipe operator.
% \end{frame}
% \begin{frame}[fragile]
% \frametitle{Not Just for Fun: Balance of Payment Data}
% << highlight=T, eval=TRUE,message=F, warning=F >>=
% library(tidyverse)
% df<-read_csv("bop_flow2.csv") %>%
% {.$Value=as.numeric(.$Value)
% .}
% @
% Let us glimpse at the resulting table (only a few rows are shown and
% they are broken into two chunks)
% % df<-read_csv("bop_flow2.csv",col_types = cols(Value = "i"))
% \begin{table}[ht]
% \centering
% \scalebox{0.7}{
% \begin{tabular}{rlllll}
% \hline
% TIME & GEO & CURRENCY & NACE\_R2 & STK\_FLOW & STK\_FLOW\_LABEL \\
% \hline
% 2016 & EU28 & Million euro & TOTAL & ASS & Assets \\
% 2016 & EU28 & Million euro & TOTAL & ASS & Assets \\
% 2016 & EU28 & Million euro & TOTAL & ASS & Assets \\
% \hline
% \end{tabular}
% }
% \end{table}
% \begin{table}[ht]
% \centering
% \scalebox{0.7}{
% \begin{tabular}{rllllr}
% \hline
% TIME & ENTITY & FDI\_ITEM & FDI\_ITEM\_LABEL & PARTNER & Value \\
% \hline
% 2016 & TOTAL & DO\_\_D\_\_F & Direct investment abroad (DIA) & CH & NA \\
% 2016 & TOTAL & DO\_\_D\_\_F & Direct investment abroad (DIA) & TR & NA \\
% 2016 & TOTAL & DO\_\_D\_\_F & Direct investment abroad (DIA) & RU & NA \\
% \hline
% \end{tabular}
% }
% \end{table}
% \end{frame}
% \begin{frame}[fragile]
% \frametitle{dplyr Verbs in Action 1/5}
% In 2015, how many million euros did the EU28 (GEO) invest
% (FDI\verb|_|ITEM is DO\verb|_|\verb|_|D\verb|_|\verb|_|F; ENTITY is TOTAL) in manufacture
% (NACE\verb|_|R2 is C) in Japan (PARTNER is JP) as outward net foreign
% direct investment (STK\verb|_|FLOW is NO)?
% << highlight=T, eval=TRUE,message=F >>=
% library(tidyverse)
% manu_JP <- df %>%filter(TIME==2015, GEO=="EU28",
% STK_FLOW=="NO",FDI_ITEM=="DO__D__F",
% ENTITY=="TOTAL",PARTNER=="JP", NACE_R2=="C") %>%
% select(TIME, GEO, PARTNER, NACE_R2, Value)
% manu_JP
% @
% \end{frame}
% \begin{frame}[fragile]
% \frametitle{dplyr Verbs in Action 2/5}
% And the total FDI to the US for all years
% << highlight=T, eval=TRUE,message=F >>=
% library(tidyverse)
% FDI_US <- df %>%filter( GEO=="EU28",
% STK_FLOW=="NO",FDI_ITEM=="DO__D__F",
% ENTITY=="TOTAL",PARTNER =="US",NACE_R2=="FDI") %>%
% select(TIME, GEO, PARTNER, NACE_R2, Value)
% FDI_US
% @
% \end{frame}
% \begin{frame}[fragile]
% \frametitle{dplyr Verbs in Action 3/5}
% Add a new column with the FDI in dollars (1 EUR = 1.21 USD)
% << highlight=T, eval=TRUE,message=F >>=
% FDI_US <- df %>%filter( GEO=="EU28",
% STK_FLOW=="NO",FDI_ITEM=="DO__D__F",
% ENTITY=="TOTAL",PARTNER =="US",NACE_R2=="FDI") %>%
% select(TIME, GEO, PARTNER, NACE_R2, Value)%>%
% mutate(ValueUSD=Value*1.21)
% FDI_US
% @
% \end{frame}
% \begin{frame}[fragile]
% \frametitle{dplyr Verbs in Action 4/5}
% And if you want the average FDI to the US along the years
% << highlight=T, eval=TRUE,message=F >>=
% library(tidyverse)
% FDI_US_mean <- df %>%filter( GEO=="EU28",
% STK_FLOW=="NO",FDI_ITEM=="DO__D__F",
% ENTITY=="TOTAL",PARTNER =="US", NACE_R2=="FDI")%>%
% summarise(mean_FDI_to_US=mean(Value))
% FDI_US_mean
% @
% \end{frame}
% \begin{frame}[fragile]
% \frametitle{dplyr Verbs in Action 5/5}
% Now you want to do the same for US and India in one go
% \vspace*{-0.2cm}
% << highlight=T, eval=TRUE,message=F >>=
% library(tidyverse)
% FDI_US_IN <- df %>%filter( GEO=="EU28",
% STK_FLOW=="NO",FDI_ITEM=="DO__D__F",
% ENTITY=="TOTAL",PARTNER %in% c("US", "IN"),
% NACE_R2=="FDI")%>%
% group_by(PARTNER) %>%
% summarise(mean_FDI=mean(Value))
% FDI_US_IN
% @
% \end{frame}
% \begin{frame}[fragile]
% \frametitle{dplyr -- Final Thoughts}
% \begin{itemize}
% \item We barely scratched the surface of dplyr
% \item but we have already seen filter, selection of columns,
% creation of new columns and
% computing statistics on groups of variables
% \item thanks to the pipe operator, most of the code that you write
% is reusable and readable
% \item you do not worry about cells, indexes etc..., but you think
% more about the questions you want to pose to your data.
% \end{itemize}
% \end{frame}
\begin{frame}[fragile]
\frametitle{Tidy Data 1/2}
\begin{itemize}
\item Not all the ways in which you can organise your tabular data are equally
suited to being manipulated with R.
\item It is strongly recommended to work with \underline{tidy}
data sets in R (definition to follow).
\item Messy (i.e. not tidy) data sets are not at all rare or
implausible, but much harder to deal with in R (and not only R).
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Tidy Data 2/2}
The tidyverse is named after the tidy data format. In tidy data
\begin{enumerate}
\item Each variable forms a column.
\item Each observation forms a row.
\item Each type of observational unit forms a table.
\end{enumerate}
\end{frame}
\begin{frame}[fragile]
\frametitle{Benefits of Tidy Data Sets}
\begin{itemize}
\item Tidy data makes it easy for an analyst or a computer to extract needed
variables because it provides a standard way of structuring a
dataset.
\item Another way to put it: you do not need different strategies to extract different variables.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Example: City Temperatures [$^{\circ}\mathrm{C}$] During 4 Weeks}
\begin{table}
\parbox{.45\linewidth}{
\centering
\begin{tabular}{rrrr}
\hline
week & city\_A & city\_B & city\_C \\
\hline
1 & 14 & 18 & 23 \\
2 & 15 & 21 & 24 \\
3 & 12 & 25 & 23 \\
4 & 13 & 17 & 25 \\
\hline
\end{tabular}
\caption{Format A: variable temperature appears in 3 columns (column headers
city\_X
are values, not variable names).}
}
\hfill
\parbox{.45\linewidth}{
\centering
\begin{tabular}{lrrrr}
\hline
week & 1 & 2 & 3 & 4 \\
\hline
city\_A & 14 & 15 & 12 & 13 \\
city\_B & 18 & 21 & 25 & 17 \\
city\_C & 23 & 24 & 23 & 25 \\
\hline
\end{tabular}
\caption{Format B: variables temperature and week appear
simultaneously in 4 columns. Furthermore the ``week'' column contains the city names.}
}
\end{table}
\begin{itemize}
\item In both format A and B, rules 1 and 2 of the tidy data sets are violated.
\item The data set has 3 variables only: city, temperature and
week. Why do we see them along 4-5 columns?
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Tidy City Temperature Data Set}
Tidy version of the data set (3 columns for
3 variables)
\begin{table}[ht]
\centering
\begin{tabular}{rlr}
\hline
week & city & temperature \\
\hline
1 & A & 14 \\
1 & B & 18 \\
1 & C & 23 \\
2 & A & 15 \\
2 & B & 21 \\
2 & C & 24 \\
3 & A & 12 \\
3 & B & 25 \\
3 & C & 23 \\
4 & A & 13 \\
4 & B & 17 \\
4 & C & 25 \\
\hline
\end{tabular}
\end{table}
Many people do not like this format because of the many rows, but it
works like a charm with dplyr.
\end{frame}
% \begin{frame}[fragile]
% \frametitle{dplyr in Action 1/4}
% \underline{Summarise} the temperature column: calculate the mean
% temperature along all the weeks in all the cities
% << highlight=T, eval=TRUE,message=F >>=
% library(tidyverse)
% tempcity<-read_csv("tidy.csv")
% mean_temp <- tempcity %>%
% summarise(aver_temp=mean(temperature))
% mean_temp
% @
% \end{frame}
% \begin{frame}[fragile]
% \frametitle{dplyr in Action 2/4}
% \underline{Group} the data \underline{by} city and then calculate the
% mean temperature for every city.
% << highlight=T, eval=TRUE,message=F >>=
% library(tidyverse)
% mean_temp_city <- tempcity %>%
% group_by(city) %>%
% summarise(aver_temp=mean(temperature))
% mean_temp_city
% @
% \end{frame}
% \begin{frame}[fragile]
% \frametitle{dplyr in Action 3/4}
% \underline{Filter} the temperatures for all the cities for week 4.
% << highlight=T, eval=TRUE,message=F >>=
% temp_week_4 <- tempcity %>%
% filter(week==4)
% temp_week_4
% @
% \end{frame}
% \begin{frame}[fragile]
% \frametitle{dplyr in Action 4/4}
% \underline{Filter} the data for city A, \underline{mutate} the data
% set by creating the temperature in Fahrenheit, and \underline{select} all the
% columns apart from the temperature in Celsius.
% << highlight=T, eval=TRUE,message=F >>=
% city_A_farh <- tempcity %>%
% filter(city=="A") %>%
% mutate(Fahrenheit=temperature*1.8+32) %>%
% select(-temperature)
% city_A_farh
% @
% \end{frame}
\begin{frame}[fragile]
\frametitle{Tidying Messy Datasets}
Real data sets are often messy in every conceivable way, e.g.
\begin{itemize}
\item Column headers are values, not variable names.
\item Multiple variables are stored in one column.
\item Variables are stored in both rows and columns.
\item Multiple types of observational units are stored in the same table.
\item A single observational unit is stored in multiple tables.
\end{itemize}
Messy data sets are not evil per se, but they certainly do not go well
with the tidyverse tools. Tidying messy data sets is in itself a large topic; we'll focus only
on a couple of examples in the following.
\end{frame}
\begin{frame}[fragile]
\frametitle{Column Headers are Values, not Variable Names}
\begin{table}[ht]
\centering
\begin{tabular}{rrrr}
\hline
week & city\_A & city\_B & city\_C \\
\hline
1 & 14 & 18 & 23 \\
2 & 15 & 21 & 24 \\
3 & 12 & 25 & 23 \\
4 & 13 & 17 & 25 \\
\hline
\end{tabular}
\caption{Data in Format A}
\end{table}
This is a rather common situation. We tackle it in two steps.
\begin{itemize}
\item We realize that a common attribute of concern is spread out across columns. We want to reformat the data such that these common attributes are gathered together as a single variable.
\item tidyr (part of the tidyverse, like dplyr) has a verb ``gather()'' which takes multiple columns and collapses them into key-value pairs, duplicating all other columns as needed.
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Tidying the Data Set}
Here is the strategy to tidy the data set
\begin{enumerate}
\item we want to restructure the city names (city\_A, city\_B and city\_C) as an
individual variable. To this aim
\item we gather each city name within one \underline{new}
column variable and also gather the temperature values associated with each city in a second column variable.
\end{enumerate}
% We created a new column city which gathers all the city names and
% the associated temperature values.
\end{frame}
\begin{frame}[fragile]
\vspace*{-0.2cm}
\frametitle{Tidy Data Set}
\vspace*{-0.2cm}
% Tidy the format A table: gather the columns city_A through city_C into a
% key column named city and a value column named temperature.
% echo=2 makes knitr echo only the 2nd expression (the gather pipeline); the
% read_csv() call and the final print are still run, so the resulting table
% appears in the output, but their code is not shown.
<< highlight=F, eval=TRUE,message=F, echo=2 >>=
city_formatA <- read_csv("format_A.csv")
city_tidyA <-city_formatA %>%
gather(city, temperature, city_A:city_C)
city_tidyA
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Final Remarks}
Here is what we did
\begin{itemize}
\item We created the new key ``city'' which contains
all the city names and
\item we created the ``temperature''
value associated with every city name (so we have the key-value pair) and
\item we gathered all the columns labelled by a city name into our
key-value pair.
\end{itemize}
The resulting data set has 12 rows (4 weeks $\times$ 3 city
names). Now every row is an observation and the data set is tidy.
\end{frame}
\begin{frame}[fragile]
\frametitle{Another Example: Format B}
Conceptually similar to the previous case: column headers are now
the values of another variable (week number instead of city name).
\begin{table}
\begin{tabular}{lrrrr}
\hline
week & 1 & 2 & 3 & 4 \\
\hline
city\_A & 14 & 15 & 12 & 13 \\
city\_B & 18 & 21 & 25 & 17 \\
city\_C & 23 & 24 & 23 & 25 \\
\hline
\end{tabular}
\caption{Data in Format B}
\end{table}
\end{frame}
\begin{frame}[fragile]
\vspace*{-0.2cm}
\frametitle{Another Example: Format B}
\vspace*{-0.2cm}
% Tidy the format B table: gather the columns named 1 through 4 into a key
% column named week_number and a value column named temperature.  The column
% names are written in backticks because they are numbers, not regular R
% names.
% echo=2 makes knitr echo only the 2nd expression (the gather pipeline); the
% read_csv() call and the final print are still run but their code is not
% shown.
<< highlight=F, eval=TRUE,message=F, echo=2 >>=
city_formatB <- read_csv("format_B.csv")
city_tidyB <-city_formatB %>%
gather(week_number, temperature, `1`:`4`)
city_tidyB
@
\end{frame}
\begin{frame}[fragile]
\frametitle{Almost There}
In format B the ``week'' column contains the city names, so that
column is mislabelled when we create the week-temperature key-value
pair. It is easy enough to fix it through the dplyr ``rename'' verb.
\end{frame}
\begin{frame}[fragile]
\frametitle{Finally There}
% Rename the column week of city_tidyB to city (rename takes new = old).
% The result is not assigned, so it is only printed on the slide and
% city_tidyB itself is left unchanged.
<< highlight=F, eval=TRUE,message=F >>=
city_tidyB %>% rename("city"="week")
@
\end{frame}
\end{document}