• R/O
  • SSH

コミット

タグ
未設定

よく使われているワード(クリックで追加)

javac++androidlinuxc#windowsobjective-ccocoa誰得qtpythonphprubygameguibathyscaphec計画中(planning stage)翻訳omegatframeworktwitterdomtestvb.netdirectxゲームエンジンbtronarduinopreviewer

コミットメタ情報

リビジョンb0f8a88cf65c483105bda59b5b44f7d9711392e8 (tree)
日時2021-02-19 23:47:03
作者Lorenzo Isella <lorenzo.isella@gmai...>
コミッターLorenzo Isella

ログメッセージ

I added some extra stats and I commented most of the code.

変更サマリ

差分

diff -r 5657114b3a2e -r b0f8a88cf65c R-codes/process_covid_tracker.R
--- a/R-codes/process_covid_tracker.R Wed Feb 17 22:48:55 2021 +0100
+++ b/R-codes/process_covid_tracker.R Fri Feb 19 15:47:03 2021 +0100
@@ -1,6 +1,6 @@
11 rm(list=ls())
22
3-## last saved on Time-stamp: "2021-02-15 11:06:30 lorenzo"
3+## last saved on Time-stamp: "2021-02-19 15:46:30 lorenzo"
44
55
66 library(tidyverse)
@@ -9,135 +9,159 @@
99 library(openxlsx)
1010 library(stringr)
1111
12-## source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")
13-
14-remove_special_char <- function(x, new_pattern=""){
15-
16-## remove special characters from a column
17-
18-res <- str_replace_all(x, "[[:punct:]]", new_pattern)
19-
20-res <- str_replace_all(x, "[^[:alnum:]]", new_pattern)
21-
22-
23-res <- str_trim(res, side ="both")
24-
25-
26-return(res)
27-
28-}
29-
12+library(kableExtra)
3013
31-move_row <- function(df, ini_pos, fin_pos){
32-
33-ll <- nrow(df)
34-
35-row_pick <- slice(df, ini_pos)
36-
37- if (fin_pos=="last"){
38-
39- res <- df %>%
40- slice(-ini_pos) %>%
41- add_row(row_pick, .before = ll)
42-
43-
44-} else{
45-
46- res <- df %>%
47- slice(-ini_pos) %>%
48- add_row(row_pick, .before = fin_pos)
49-}
50-
51- return(res)
52-}
14+source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")
5315
5416
5517
56-add_total <- function(x, pos=1, ...){
57- adorn_totals(x, ...) %>% as_tibble %>%
58- move_row(nrow(.), pos)
59-}
18+## pattern_to_na <- function(df, pattern){
19+
20+## res <- df %>% na_if(., pattern)
21+
22+## return(res)
23+
24+## }
25+
26+
27+## ### This function removes punctuation and special characters
28+
29+## remove_special_char <- function(x, new_pattern=""){
30+
31+## ## remove special characters from a column
32+
33+## res <- str_replace_all(x, "[[:punct:]]", new_pattern)
34+
35+## res <- str_replace_all(x, "[^[:alnum:]]", new_pattern)
36+
37+
38+## res <- str_trim(res, side ="both")
39+
40+
41+## return(res)
42+
43+## }
44+
45+
46+## ## this function moves a given row of a table to the desired position
47+
48+## move_row <- function(df, ini_pos, fin_pos){
49+
50+## ll <- nrow(df)
51+
52+## row_pick <- slice(df, ini_pos)
53+
54+## if (fin_pos=="last"){
55+
56+## res <- df %>%
57+## slice(-ini_pos) %>%
58+## add_row(row_pick, .before = ll)
59+
60+
61+## } else{
62+
63+## res <- df %>%
64+## slice(-ini_pos) %>%
65+## add_row(row_pick, .before = fin_pos)
66+## }
67+
68+## return(res)
69+## }
70+
71+
72+## ## this function adds a total row to a table
73+
74+## add_total <- function(x, pos=1, ...){
75+## adorn_totals(x, ...) %>% as_tibble %>%
76+## move_row(nrow(.), pos)
77+## }
6078
6179
6280
6381
64-## a function to remove duplicated columns (see http://bit.ly/2s0q6cC )
65-
66-rem_dupl_cols <- function(df){
82+## ## a function to remove duplicated columns (see http://bit.ly/2s0q6cC )
6783
68-res <- df[, !duplicated(t(df))]
84+## rem_dupl_cols <- function(df){
6985
70-return(res)
86+## res <- df[, !duplicated(t(df))]
7187
72-}
88+## return(res)
89+
90+## }
7391
7492
75-## a function to remove constant columns (see http://bit.ly/2s1qqaY
76-## and http://bit.ly/2s1Sb3q . The solution in the second link is better
77-## because it works also with factors and characters )
93+## ## a function to remove constant columns (see http://bit.ly/2s1qqaY
94+## ## and http://bit.ly/2s1Sb3q . The solution in the second link is better
95+## ## because it works also with factors and characters )
7896
79-rem_const_cols <- function(df){
80-## res <- df[,apply(df, 2, var, na.rm=TRUE) != 0]
97+## rem_const_cols <- function(df){
98+## ## res <- df[,apply(df, 2, var, na.rm=TRUE) != 0]
8199
82100
83-## res <- df[sapply(df, function(x) length(unique(na.omit(x)))) > 1]
101+## ## res <- df[sapply(df, function(x) length(unique(na.omit(x)))) > 1]
84102
85- res <- df %>%
86- select(where(~length(unique(na.omit(.x))) > 1))
103+## res <- df %>%
104+## select(where(~length(unique(na.omit(.x))) > 1))
87105
88106
89107
90-return(res)
108+## return(res)
91109
92-}
93-
110+## }
94111
95112
96113
97114
98-clean_data <- function(x){
115+## ## this function makes the column names machine-friendly and removes
116+## ## duplicated rows, duplicated columns and constant columns
99117
100- res <- x %>%
101- clean_names() %>%
102- remove_empty() %>%
103- distinct() %>%
104- rem_dupl_cols() %>%
105- rem_const_cols()
118+## clean_data <- function(x){
106119
107- return(res)
120+## res <- x %>%
121+## clean_names() %>%
122+## remove_empty() %>%
123+## distinct() %>%
124+## rem_dupl_cols() %>%
125+## rem_const_cols()
126+
127+## return(res)
108128
109129
110-}
130+## }
111131
112132
113133
114-na_to_pattern <- function(df, pattern){
134+## ## this function changes all the missing data in a table to the desired pattern
115135
116-res <- df %>% replace(., is.na(.), pattern)
136+## na_to_pattern <- function(df, pattern){
117137
118-return(res)
138+## res <- df %>% replace(., is.na(.), pattern)
139+
140+## return(res)
119141
120-}
142+## }
121143
122144
123145
124-save_excel <- function(output, fileName, sheetName="data", na_yes = TRUE,...){
125-tryCatch({
146+## ### This function is used to save the output of the calculations as an excel file
126147
127- wb <- loadWorkbook(fileName)
128- addWorksheet(wb = wb, sheet = sheetName)
129- writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F,
130- keepNA=na_yes ,...)
131- saveWorkbook(wb = wb, file = fileName, overwrite = T)
132-},
133-error = function(err){
148+## save_excel <- function(output, fileName, sheetName="data", na_yes = TRUE,...){
149+## tryCatch({
134150
135- wb <- createWorkbook(fileName)
136- addWorksheet(wb = wb, sheet = sheetName)
137- writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F, keepNA=na_yes , ... )
138- saveWorkbook(wb = wb, file = fileName, overwrite = T)
139-})
140-}
151+## wb <- loadWorkbook(fileName)
152+## addWorksheet(wb = wb, sheet = sheetName)
153+## writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F,
154+## keepNA=na_yes ,...)
155+## saveWorkbook(wb = wb, file = fileName, overwrite = T)
156+## },
157+## error = function(err){
158+
159+## wb <- createWorkbook(fileName)
160+## addWorksheet(wb = wb, sheet = sheetName)
161+## writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F, keepNA=na_yes , ... )
162+## saveWorkbook(wb = wb, file = fileName, overwrite = T)
163+## })
164+## }
141165
142166
143167
@@ -149,6 +173,10 @@
149173
150174
151175
176+## tf_list and tf_list2 are two lists of the same temporary framework
177+## chapters. They are written in a different way because they need to be
178+## looked for (grepped) in different places
179+
152180 tf_list <- c("TF1:", "TF2:", "TF3:", ## "TF4",
153181 ## "TF5",
154182 "TF6:", "TF7:", "TF8:",
@@ -161,33 +189,37 @@
161189 "TF9", "TF10", "TF11", "TF12")
162190
163191
192+
193+### import the raw data about the amendments
164194 df_am_ini <- read_csv("amendments.csv")
165195
166-
196+## import the raw data about the decisions
167197 df_dec_ini <- read_csv("decisions.csv", , locale = readr::locale(encoding = "latin1") )
168198
169199
170- ## read.csv("decisions.csv", fileEncoding="latin1")
171-
172200
173201 df_am <- df_am_ini %>%
174- clean_data()
202+ clean_data() ### clean the amendments data
175203
176204 df_dec <- df_dec_ini %>%
177- clean_data()
205+ clean_data() ### clean the decision data
178206
179-rep1_ini <- read_csv("report1_detailed.csv")
207+rep1_ini <- read_csv("report1_detailed.csv") ## import the raw data from the
208+## full report
180209
181210
182211
183212 rep1 <- rep1_ini %>%
184- clean_data()
213+ clean_data() ## clean the data of the full report
185214
186215 rep1_unique <- rep1 %>%
187- distinct(case_reference, .keep_all=T)
216+ distinct(case_reference, .keep_all=T) ## generates the equivalent of
217+## report 1 no duplicates in the excel tracker. This is a subset of the full report without duplicated case references.
188218
189219
190220
221+### budget calculation. The report without duplicates is broken down (group_by) by member state (MS) and for each MS we calculate the sum of the confirmed budget. For each MS we also calculate the share of the total budget.
222+## A row with the totals is also added.
191223
192224 budget_test <- rep1_unique %>%
193225 group_by(member_state_2_letter_code) %>%
@@ -195,151 +227,16 @@
195227 ungroup %>%
196228 mutate(share=budget/sum(budget)) %>%
197229 add_total(nrow(.)+1)
198-
199-
200-
201-
202-decisions_test <- rep1_unique %>%
203- filter(!is.na(legal_basis))%>%
204- group_by(member_state_2_letter_code) %>%
205- summarise(n_decisions=n()) %>%
206- ungroup() %>%
207- arrange(member_state_2_letter_code)%>%
208- add_total(nrow(.)+1)
209-
210-
211-
212-count_am <- df_am %>%
213- group_by(member_state_of_amendment) %>%
214- summarise(n_amendments=length(unique(amendment_case_number))) %>%
215- ungroup%>%
216- add_total(nrow(.)+1)
217-
218-
219-
220-duration_calc <- rep1 %>%
221- filter(register_code %in% c("N", "PN"),
222- decided=="YES") %>%
223- select(case_reference, member_state_2_letter_code, average_duration,
224- register_code, legal_basis, type_of_measure )%>%
225- pivot_wider(names_from="register_code", values_from="average_duration") %>%
226- ## na_to_pattern(0) %>%
227- mutate(type=if_else(is.na(PN), "N", "P+N")) %>%
228- rowwise() %>%
229- mutate(total_duration=sum(N,PN, na.rm=T)) %>%
230- ungroup
231-
232-
233-## save_excel(duration_calc, "duration_raw_data.xlsx")
234-
235-
236-
237-
238-duration_stat_ms <- duration_calc %>%
239- group_by(member_state_2_letter_code) %>%
240- summarise(mean_duration=mean(total_duration, na.rm=T),
241- median_duration=median(total_duration, na.rm=T),
242- mean_PN_duration=mean(PN, na.rm=T),
243- median_PN_duration=median(PN, na.rm=T),
244- mean_N_duration=mean(N, na.rm=T),
245- median_N_duration=median(N, na.rm=T),
246- number_data_points_N=n()-sum(is.na(N)),
247- number_data_points_PN=n()-sum(is.na(PN))
248- )%>%
249- ungroup
250-
251-## save_excel(duration_stat_ms, "duration_by_MS.xlsx")
252-
253230
254231
255-duration_stat_eu <- duration_calc %>%
256- ## group_by(legal_basis) %>%
257- summarise(mean_duration=mean(total_duration, na.rm=T),
258- median_duration=median(total_duration, na.rm=T),
259- mean_PN_duration=mean(PN, na.rm=T),
260- median_PN_duration=median(PN, na.rm=T),
261- mean_N_duration=mean(N, na.rm=T),
262- median_N_duration=median(N, na.rm=T),
263- number_data_points_N=n()-sum(is.na(N)),
264- number_data_points_PN=n()-sum(is.na(PN))) %>%
265- ungroup
266-
267-## save_excel(duration_stat_eu, "duration_all_eu.xlsx")
268-
269-duration_stat_basis <- duration_calc %>%
270- group_by(legal_basis) %>%
271- summarise(mean_duration=mean(total_duration, na.rm=T),
272- median_duration=median(total_duration, na.rm=T),
273- mean_PN_duration=mean(PN, na.rm=T),
274- median_PN_duration=median(PN, na.rm=T),
275- mean_N_duration=mean(N, na.rm=T),
276- median_N_duration=median(N, na.rm=T),
277- number_data_points_N=n()-sum(is.na(N)),
278- number_data_points_PN=n()-sum(is.na(PN)))%>%
279- ungroup
280-
281-## save_excel(duration_stat_basis, "duration_by_legal_basis.xlsx")
282-
283-
284-
285-
286-duration_stat_basis_ms <- duration_calc %>%
287- group_by(member_state_2_letter_code,legal_basis) %>%
288- summarise(mean_duration=mean(total_duration, na.rm=T),
289- median_duration=median(total_duration, na.rm=T),
290- mean_PN_duration=mean(PN, na.rm=T),
291- median_PN_duration=median(PN, na.rm=T),
292- mean_N_duration=mean(N, na.rm=T),
293- median_N_duration=median(N, na.rm=T),
294- number_data_points_N=n()-sum(is.na(N)),
295- number_data_points_PN=n()-sum(is.na(PN)))%>%
296- ungroup %>%
297- arrange(member_state_2_letter_code)
298-
299-
300-
301-
302-
303-
304-
305-duration_stat_measure <- duration_calc %>%
306- group_by(type_of_measure) %>%
307- summarise(mean_duration=mean(total_duration, na.rm=T),
308- median_duration=median(total_duration, na.rm=T),
309- mean_PN_duration=mean(PN, na.rm=T),
310- median_PN_duration=median(PN, na.rm=T),
311- mean_N_duration=mean(N, na.rm=T),
312- median_N_duration=median(N, na.rm=T),
313- number_data_points_N=n()-sum(is.na(N)),
314- number_data_points_PN=n()-sum(is.na(PN)))%>%
315- ungroup
316-
317-
318-
319-
320-duration_stat_measure_ms <- duration_calc %>%
321- group_by(member_state_2_letter_code,type_of_measure) %>%
322- summarise(mean_duration=mean(total_duration, na.rm=T),
323- median_duration=median(total_duration, na.rm=T),
324- mean_PN_duration=mean(PN, na.rm=T),
325- median_PN_duration=median(PN, na.rm=T),
326- mean_N_duration=mean(N, na.rm=T),
327- median_N_duration=median(N, na.rm=T),
328- number_data_points_N=n()-sum(is.na(N)),
329- number_data_points_PN=n()-sum(is.na(PN)))%>%
330- ungroup %>%
331- arrange(member_state_2_letter_code)
332-
333-
334-
335-
336-## save_excel(duration_stat_measure, "duration_by_legal_measure.xlsx")
337232
338233
339234 aid_type <- tibble(code=c("AH", "IA", "S"),
340235 name=c("Ad hoc", "Individual application within a scheme",
341236 "Schemes"))
342237
238+### budget broken by type
239+
343240 budget_type <- rep1_unique %>%
344241 group_by(## member_state_2_letter_code,
345242 case_type_code) %>%
@@ -352,6 +249,7 @@
352249 add_total(nrow(.)+1)
353250
354251
252+### budget broken by type and MS
355253
356254 budget_type_ms <- rep1_unique %>%
357255 group_by( member_state_2_letter_code,
@@ -370,7 +268,7 @@
370268
371269
372270
373-
271+#### budget broken by measure
374272
375273
376274 budget_measure <- rep1_unique %>%
@@ -384,7 +282,7 @@
384282
385283
386284
387-
285+#### budget broken by measure and MS
388286
389287 budget_measure_ms <- rep1_unique %>%
390288 group_by( member_state_2_letter_code,
@@ -405,9 +303,107 @@
405303
406304
407305
306+### budget broken by beneficiary sector
408307
409308
410309
310+budget_by_beneficiary_sector <- rep1_unique %>%
311+ group_by(beneficary_sector) %>%
312+ summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
313+ ungroup %>%
314+ filter(complete.cases(.)) %>%
315+ mutate(share=budget/sum(budget)) %>%
316+ arrange(desc(budget))%>%
317+ add_total(nrow(.)+1)
318+
319+
320+### budget broken by MS and beneficiary sector
321+
322+budget_by_beneficiary_sector_ms <- rep1_unique %>%
323+ group_by(member_state_2_letter_code,beneficary_sector) %>%
324+ summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
325+ na.omit %>%
326+ mutate(share=budget/sum(budget)) %>%
327+ group_modify(~ .x %>%
328+ adorn_totals("row")) %>%
329+ ungroup%>%
330+ complete(member_state_2_letter_code,beneficary_sector) %>%
331+ na_to_pattern(0)
332+
333+
334+
335+
336+### budget broken by beneficiary type
337+
338+
339+budget_by_beneficiary_type <- rep1_unique %>%
340+ group_by(beneficary_type) %>%
341+ summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
342+ ungroup %>%
343+ filter(complete.cases(.)) %>%
344+ mutate(share=budget/sum(budget)) %>%
345+ arrange(desc(budget))%>%
346+ add_total(nrow(.)+1)
347+
348+
349+
350+### budget broken by beneficiary type and MS
351+
352+
353+budget_by_beneficiary_type_ms <- rep1_unique %>%
354+ group_by(member_state_2_letter_code,beneficary_type) %>%
355+ summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
356+ na.omit %>%
357+ mutate(share=budget/sum(budget)) %>%
358+ group_modify(~ .x %>%
359+ adorn_totals("row")) %>%
360+ ungroup%>%
361+ complete(member_state_2_letter_code,beneficary_type) %>%
362+ na_to_pattern(0)
363+
364+
365+
366+
367+
368+
369+
370+
371+
372+
373+
374+
375+
376+
377+
378+
379+
380+
381+
382+
383+
384+
385+
386+
387+
388+
389+
390+### statistics on the number of decisions.
391+## from the report file without duplicates, we remove those rows for which
392+## the legal basis is missing. We break down the remaining data by MS and
393+## we count the number of decisions for each MS (just the number of lines for each MS) and we add a total row
394+
395+decisions_test <- rep1_unique %>%
396+ filter(!is.na(legal_basis))%>%
397+ group_by(member_state_2_letter_code) %>%
398+ summarise(n_decisions=n()) %>%
399+ ungroup() %>%
400+ arrange(member_state_2_letter_code)%>%
401+ add_total(nrow(.)+1)
402+
403+
404+
405+### number of decisions by legal_basis
406+
411407 decisions_eu <- rep1_unique %>%
412408 filter(!is.na(legal_basis))%>%
413409 group_by(legal_basis) %>%
@@ -418,6 +414,8 @@
418414 add_total(nrow(.)+1)
419415
420416
417+### number of decisions by legal_basis and MS
418+
421419
422420 decisions_eu_ms <- rep1_unique %>%
423421 filter(!is.na(legal_basis))%>%
@@ -436,6 +434,300 @@
436434
437435
438436
437+
438+## statistics on the number of amendments. We use the first two columns of
439+## the amendments file. We break the amendment data by MS and we count the number of amendments for each MS as the number of *unique* amendment case numbers (in case some of them are repeated)
440+
441+
442+count_am <- df_am %>%
443+ group_by(member_state_of_amendment) %>%
444+ summarise(n_amendments=length(unique(amendment_case_number))) %>%
445+ ungroup%>%
446+ add_total(nrow(.)+1)
447+
448+
449+### this is the fundamental table for the duration calculations
450+## we consider the full report and we filter the register code to be
451+## either N or PN and the case needs to have been decided (i.e. decided = YES)
452+## we select the columns case_reference, member_state_2_letter_code,
453+## average_duration, register_code, legal_basis, type_of_measure
454+## and we pivot the table.
455+
456+## Finally we obtain a table with the structure
457+
458+## case_reference member_state_2_… legal_basis type_of_measure N PN type total_duration
459+
460+## in which we have the duration of the notification and prenotification phase of all the decided cases
461+
462+duration_calc <- rep1 %>%
463+ filter(register_code %in% c("N", "PN"),
464+ decided=="YES") %>%
465+ select(case_reference, member_state_2_letter_code, average_duration,
466+ register_code, legal_basis, type_of_measure )%>%
467+ pivot_wider(names_from="register_code", values_from="average_duration") %>%
468+ ## na_to_pattern(0) %>%
469+ mutate(type=if_else(is.na(PN), "N", "P+N")) %>%
470+ rowwise() %>%
471+ mutate(total_duration=sum(N,PN, na.rm=T)) %>%
472+ ungroup
473+
474+
475+## save_excel(duration_calc, "duration_decisions_raw_data.xlsx")
476+
477+
478+
479+
480+
481+
482+
483+
484+## we break the duration_calc data by MS and we calculate statistics
485+## on the duration of the N, PN and combined N+PN process.
486+
487+duration_stat_ms <- duration_calc %>%
488+ group_by(member_state_2_letter_code) %>%
489+ summarise(mean_duration=mean(total_duration, na.rm=T),
490+ median_duration=median(total_duration, na.rm=T),
491+ mean_PN_duration=mean(PN, na.rm=T),
492+ median_PN_duration=median(PN, na.rm=T),
493+ mean_N_duration=mean(N, na.rm=T),
494+ median_N_duration=median(N, na.rm=T),
495+ number_data_points_N=n()-sum(is.na(N)),
496+ number_data_points_PN=n()-sum(is.na(PN))
497+ )%>%
498+ ungroup
499+
500+## save_excel(duration_stat_ms, "duration_by_MS.xlsx")
501+
502+
503+### As above, but without breaking the data by MS ---> we calculate the stats for the whole EU.
504+
505+duration_stat_eu <- duration_calc %>%
506+ ## group_by(legal_basis) %>%
507+ summarise(mean_duration=mean(total_duration, na.rm=T),
508+ median_duration=median(total_duration, na.rm=T),
509+ mean_PN_duration=mean(PN, na.rm=T),
510+ median_PN_duration=median(PN, na.rm=T),
511+ mean_N_duration=mean(N, na.rm=T),
512+ median_N_duration=median(N, na.rm=T),
513+ number_data_points_N=n()-sum(is.na(N)),
514+ number_data_points_PN=n()-sum(is.na(PN))) %>%
515+ ungroup
516+
517+## save_excel(duration_stat_eu, "duration_all_eu.xlsx")
518+
519+
520+#### Now the duration statistics is calculated by breaking the data according
521+## to the legal basis
522+
523+duration_stat_basis <- duration_calc %>%
524+ group_by(legal_basis) %>%
525+ summarise(mean_duration=mean(total_duration, na.rm=T),
526+ median_duration=median(total_duration, na.rm=T),
527+ mean_PN_duration=mean(PN, na.rm=T),
528+ median_PN_duration=median(PN, na.rm=T),
529+ mean_N_duration=mean(N, na.rm=T),
530+ median_N_duration=median(N, na.rm=T),
531+ number_data_points_N=n()-sum(is.na(N)),
532+ number_data_points_PN=n()-sum(is.na(PN)))%>%
533+ ungroup
534+
535+## save_excel(duration_stat_basis, "duration_by_legal_basis.xlsx")
536+
537+
538+## Now the duration statistics are broken down both by MS and by legal basis.
539+
540+duration_stat_basis_ms <- duration_calc %>%
541+ group_by(member_state_2_letter_code,legal_basis) %>%
542+ summarise(mean_duration=mean(total_duration, na.rm=T),
543+ median_duration=median(total_duration, na.rm=T),
544+ mean_PN_duration=mean(PN, na.rm=T),
545+ median_PN_duration=median(PN, na.rm=T),
546+ mean_N_duration=mean(N, na.rm=T),
547+ median_N_duration=median(N, na.rm=T),
548+ number_data_points_N=n()-sum(is.na(N)),
549+ number_data_points_PN=n()-sum(is.na(PN)))%>%
550+ ungroup %>%
551+ arrange(member_state_2_letter_code)
552+
553+
554+
555+
556+
557+#### now the duration statistics is broken by type of measure.
558+
559+duration_stat_measure <- duration_calc %>%
560+ group_by(type_of_measure) %>%
561+ summarise(mean_duration=mean(total_duration, na.rm=T),
562+ median_duration=median(total_duration, na.rm=T),
563+ mean_PN_duration=mean(PN, na.rm=T),
564+ median_PN_duration=median(PN, na.rm=T),
565+ mean_N_duration=mean(N, na.rm=T),
566+ median_N_duration=median(N, na.rm=T),
567+ number_data_points_N=n()-sum(is.na(N)),
568+ number_data_points_PN=n()-sum(is.na(PN)))%>%
569+ ungroup
570+
571+
572+### and now it is also further broken by MS.
573+
574+duration_stat_measure_ms <- duration_calc %>%
575+ group_by(member_state_2_letter_code,type_of_measure) %>%
576+ summarise(mean_duration=mean(total_duration, na.rm=T),
577+ median_duration=median(total_duration, na.rm=T),
578+ mean_PN_duration=mean(PN, na.rm=T),
579+ median_PN_duration=median(PN, na.rm=T),
580+ mean_N_duration=mean(N, na.rm=T),
581+ median_N_duration=median(N, na.rm=T),
582+ number_data_points_N=n()-sum(is.na(N)),
583+ number_data_points_PN=n()-sum(is.na(PN)))%>%
584+ ungroup %>%
585+ arrange(member_state_2_letter_code)
586+
587+
588+
589+
590+
591+
592+
593+
594+
595+
596+
597+
598+
599+
600+
601+
602+
603+
604+
605+
606+#### I keep only some columns of the duration calc table and I rename them
607+duration_calc_restricted <- duration_calc %>%
608+ select(case_reference,member_state_2_letter_code, N, PN) %>%
609+ na_to_pattern(0) %>%
610+ mutate(total_duration=N+PN) %>%
611+ select(-c(N, PN)) %>%
612+ rename("MS"="member_state_2_letter_code") %>%
613+ mutate(type="decision")
614+
615+
616+
617+
618+###file with the raw data on the duration of the amendments.
619+### it is important to know that when a PN notification has a duration of zero,
620+## it actually means that it had never taken place and so the zeros should be
621+## treated as missing data.
622+
623+## I also keep only the distinct amendment numbers to avoid double-counting
624+## the amendments.
625+
626+duration_calc_am <- df_am %>%
627+ distinct(amendment_case_number, .keep_all=T) %>%
628+ select(case_number_key , amendment_case_number,member_state_of_amendment,
629+ duration_of_n_phase_calendar_days, duration_of_pn_phase_calendar_days,total_duration_pn_n_calendar_days
630+ ) %>%
631+ pattern_to_na(0) %>%
632+ mutate(is_PN_plus_N=if_else(!is.na(duration_of_pn_phase_calendar_days),
633+ "yes", "no"))
634+
635+
636+### I now select only some columns of the duration_calc_am table and I rename them
637+duration_calc_am_restricted <- duration_calc_am %>%
638+ select(case_number_key,member_state_of_amendment,total_duration_pn_n_calendar_days ) %>%
639+ rename("case_reference"="case_number_key",
640+ "MS"="member_state_of_amendment",
641+ "total_duration"="total_duration_pn_n_calendar_days") %>%
642+ mutate(type="amendment")
643+
644+
645+
646+#### Here I create a table where for every case I list the time to take a decision and the duration of each amendment process.
647+
648+duration_am_plus_decisions <- bind_rows(duration_calc_am_restricted,
649+ duration_calc_restricted) %>%
650+ arrange(MS, case_reference)
651+
652+save_excel(duration_am_plus_decisions, "duration_decisions_amendments_raw_data.xlsx")
653+
654+
655+## I calculate some statistics by MS on the collected data on the durations of decisions and amendments.
656+
657+duration_am_plus_decisions_ms <- duration_am_plus_decisions %>%
658+ group_by(MS) %>%
659+ summarise(mean_duration=mean(total_duration),
660+ median_duration=median(total_duration)) %>%
661+ ungroup
662+
663+
664+## and now the same stats at the EU level.
665+duration_am_plus_decisions_eu <- duration_am_plus_decisions %>%
666+ ## group_by(MS) %>%
667+ summarise(mean_duration=mean(total_duration),
668+ median_duration=median(total_duration)) ## %>%
669+ ## ungroup
670+
671+
672+
673+### I now calculate the statistics on the duration of the amendments process
674+## at the MS level
675+
676+duration_stat_am_ms <- duration_calc_am %>%
677+ group_by(member_state_of_amendment) %>%
678+ summarise(mean_duration=mean(total_duration_pn_n_calendar_days, na.rm=T),
679+ median_duration=median(total_duration_pn_n_calendar_days, na.rm=T),
680+ mean_PN_duration=mean(duration_of_pn_phase_calendar_days ,
681+ na.rm=T),
682+ median_PN_duration=median(duration_of_pn_phase_calendar_days ,
683+ na.rm=T),
684+ mean_N_duration=mean(duration_of_n_phase_calendar_days, na.rm=T),
685+ median_N_duration=median(duration_of_n_phase_calendar_days, na.rm=T),
686+ number_data_points_N=n()-sum(is.na(duration_of_n_phase_calendar_days)),
687+ number_data_points_PN=n()-sum(is.na(duration_of_pn_phase_calendar_days))
688+ )%>%
689+ ungroup
690+
691+
692+
693+
694+## and now the same stats at the EU level.
695+
696+duration_stat_am_eu <- duration_calc_am %>%
697+ ## group_by(member_state_of_amendment) %>%
698+ summarise(mean_duration=mean(total_duration_pn_n_calendar_days, na.rm=T),
699+ median_duration=median(total_duration_pn_n_calendar_days, na.rm=T),
700+ mean_PN_duration=mean(duration_of_pn_phase_calendar_days ,
701+ na.rm=T),
702+ median_PN_duration=median(duration_of_pn_phase_calendar_days ,
703+ na.rm=T),
704+ mean_N_duration=mean(duration_of_n_phase_calendar_days, na.rm=T),
705+ median_N_duration=median(duration_of_n_phase_calendar_days, na.rm=T),
706+ number_data_points_N=n()-sum(is.na(duration_of_n_phase_calendar_days)),
707+ number_data_points_PN=n()-sum(is.na(duration_of_pn_phase_calendar_days))
708+ )## %>%
709+ ## ungroup
710+
711+
712+
713+
714+
715+
716+
717+
718+## save_excel(duration_stat_measure, "duration_by_legal_measure.xlsx")
719+
720+
721+
722+
723+
724+
725+
726+
727+
728+
729+### This time we just count the number of measures by legal basis
730+
439731 measures_covid <- rep1_unique %>%
440732 filter(decided=="YES", !is.na(legal_basis))%>%
441733 group_by(legal_basis) %>%
@@ -445,7 +737,7 @@
445737 mutate(share=n_measures/sum(n_measures))%>%
446738 add_total(nrow(.)+1)
447739
448-
740+### now number of measures by legal basis and MS
449741
450742 measures_covid_ms <- rep1_unique %>%
451743 filter(decided=="YES", !is.na(legal_basis))%>%
@@ -464,6 +756,7 @@
464756
465757
466758
759+### Statistics on SA cases under assessment
467760
468761 sa_n_pn <- rep1_unique %>%
469762 ## filter(decided=="YES", !is.na(legal_basis),
@@ -476,7 +769,7 @@
476769 add_total(nrow(.)+1)
477770
478771
479-
772+### as above, but also broken by MS
480773 sa_n_pn_ms <- rep1_unique %>%
481774 ## filter(decided=="YES", !is.na(legal_basis),
482775 ## register_code %in% c("N", "PN"))%>%
@@ -494,7 +787,7 @@
494787
495788
496789
497-
790+### and this time broken by Unit dealing with them
498791 sa_n_pn_unit <- rep1_unique %>%
499792 ## filter(decided=="YES", !is.na(legal_basis),
500793 ## register_code %in% c("N", "PN"))%>%
@@ -509,6 +802,7 @@
509802
510803
511804
805+### now broken by both unit and MS
512806
513807 sa_n_pn_unit_ms <- rep1_unique %>%
514808 ## filter(decided=="YES", !is.na(legal_basis),
@@ -529,65 +823,9 @@
529823
530824
531825
532-
533-
534-budget_by_beneficiary_sector <- rep1_unique %>%
535- group_by(beneficary_sector) %>%
536- summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
537- ungroup %>%
538- filter(complete.cases(.)) %>%
539- mutate(share=budget/sum(budget)) %>%
540- arrange(desc(budget))%>%
541- add_total(nrow(.)+1)
542-
543-
544-
545-
546-budget_by_beneficiary_sector_ms <- rep1_unique %>%
547- group_by(member_state_2_letter_code,beneficary_sector) %>%
548- summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
549- na.omit %>%
550- mutate(share=budget/sum(budget)) %>%
551- group_modify(~ .x %>%
552- adorn_totals("row")) %>%
553- ungroup%>%
554- complete(member_state_2_letter_code,beneficary_sector) %>%
555- na_to_pattern(0)
556-
557-
558-
559-
560-
561-
562-
563-budget_by_beneficiary_type <- rep1_unique %>%
564- group_by(beneficary_type) %>%
565- summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
566- ungroup %>%
567- filter(complete.cases(.)) %>%
568- mutate(share=budget/sum(budget)) %>%
569- arrange(desc(budget))%>%
570- add_total(nrow(.)+1)
571-
572-
573-
574-
575-
576-budget_by_beneficiary_type_ms <- rep1_unique %>%
577- group_by(member_state_2_letter_code,beneficary_type) %>%
578- summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
579- na.omit %>%
580- mutate(share=budget/sum(budget)) %>%
581- group_modify(~ .x %>%
582- adorn_totals("row")) %>%
583- ungroup%>%
584- complete(member_state_2_letter_code,beneficary_type) %>%
585- na_to_pattern(0)
586-
587-
588-
589-
590-
826+### from the rep1_unique table, we filter the decided cases with procedure
827+## either N or PN. then we collapse all the text in type_of_measure column and we count how many times we find there the words in tf_list (i.e. the TF chapters)
828+## we want to track.
591829
592830 chapters_stat_test <- rep1_unique %>%
593831 filter(decided=="YES",
@@ -599,18 +837,19 @@
599837 mutate(share=value/sum(value))%>%
600838 mutate(TF_chapter=tf_list)
601839
602-## test1 <- rep1_unique %>%
603-## filter(decided=="YES",
604-## register_code %in% c("N", "PN"))%>%
605-## filter(grepl("TF3",type_of_measure) )
606840
607-## test1bis <- rep1_unique %>%
608-## filter(case_reference=="SA.57574")
841+### The calculation above is correct but it does not allow one to see exactly
842+## which TF chapter each case is associated with.
843+
609844
610845 tf <- rep1_unique %>%
611846 filter(decided=="YES",
612- register_code %in% c("N", "PN"))
613-
847+ register_code %in% c("N", "PN")) ### I again take the decided
848+ ### case either N or PN as register_code
849+
850+### here I use map_df to iterate the filtering of the tf dataframe according
851+## to the tf_list of chapters
852+
614853 tf_all <- map_df(tf_list, function(x) {filter(tf,grepl(x, type_of_measure)) %>%
615854 mutate(TF=x) }) %>%
616855 mutate(TF=remove_special_char(TF)) %>%
@@ -623,6 +862,11 @@
623862 ## filter(grepl("TF1", type_of_measure))
624863
625864
865+### I need to treat separately the cases associated with more than 3 chapters
866+## because they are stored somewhere else. These calculations are along
867+## the lines of chapters_stat_test, but I need tf_list2 because the TF chapters
868+## are written a bit differently.
869+
626870 chapters_stat_extra <- df_dec %>%
627871 pull(specify_tf_others_more_than_three_sections_combined) %>%
628872 paste0(collapse = " ") %>%
@@ -631,6 +875,7 @@
631875 mutate(share=value/sum(value))%>%
632876 mutate(TF_chapter=tf_list)
633877
878+### and once again, if I want to have the association between case and TF chapter, I run calculations similar to those for tf_all
634879
635880 tf_all2 <- map_df(tf_list2, function(x) {filter(df_dec,
636881 grepl(x, specify_tf_others_more_than_three_sections_combined)) %>%
@@ -640,14 +885,17 @@
640885 rename("description"="specify_tf_others_more_than_three_sections_combined")
641886
642887
888+### and now I have the detailed situation for all cases and chapters
889+
643890 all_chapter_cases <- bind_rows(tf_all, tf_all2)
644891
645-## save_excel(all_chapter_cases, "chapters_detailed.xlsx")
646-
647892
648893
649894
650895
896+#### If I just need the statistics on all the TF chapters, I can add the count
897+## data on chapter usage from chapters_stat_test and chapters_stat_extra
898+
651899
652900 chapters_stat_fin <- chapters_stat_test %>%
653901 select(TF_chapter, value) %>%
@@ -657,6 +905,8 @@
657905 mutate(TF_chapter=remove_special_char(TF_chapter))
658906
659907
908+#### I can finally use the detailed statistics in all_chapter_cases
909+### to count the number of TF chapters per MS.
660910
661911 chapters_stat_fin_ms <- all_chapter_cases %>%
662912 tabyl(member_state_code, TF) %>%