リビジョン | b0f8a88cf65c483105bda59b5b44f7d9711392e8 (tree) |
---|---|
日時 | 2021-02-19 23:47:03 |
作者 | Lorenzo Isella <lorenzo.isella@gmai...> |
コミッター | Lorenzo Isella |
I added some extra stats and I commented most of the code.
@@ -1,6 +1,6 @@ | ||
1 | 1 | rm(list=ls()) |
2 | 2 | |
3 | -## last saved on Time-stamp: "2021-02-15 11:06:30 lorenzo" | |
3 | +## last saved on Time-stamp: "2021-02-19 15:46:30 lorenzo" | |
4 | 4 | |
5 | 5 | |
6 | 6 | library(tidyverse) |
@@ -9,135 +9,159 @@ | ||
9 | 9 | library(openxlsx) |
10 | 10 | library(stringr) |
11 | 11 | |
12 | -## source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R") | |
13 | - | |
14 | -remove_special_char <- function(x, new_pattern=""){ | |
15 | - | |
16 | -## remove special characters from a column | |
17 | - | |
18 | -res <- str_replace_all(x, "[[:punct:]]", new_pattern) | |
19 | - | |
20 | -res <- str_replace_all(x, "[^[:alnum:]]", new_pattern) | |
21 | - | |
22 | - | |
23 | -res <- str_trim(res, side ="both") | |
24 | - | |
25 | - | |
26 | -return(res) | |
27 | - | |
28 | -} | |
29 | - | |
12 | +library(kableExtra) | |
30 | 13 | |
31 | -move_row <- function(df, ini_pos, fin_pos){ | |
32 | - | |
33 | -ll <- nrow(df) | |
34 | - | |
35 | -row_pick <- slice(df, ini_pos) | |
36 | - | |
37 | - if (fin_pos=="last"){ | |
38 | - | |
39 | - res <- df %>% | |
40 | - slice(-ini_pos) %>% | |
41 | - add_row(row_pick, .before = ll) | |
42 | - | |
43 | - | |
44 | -} else{ | |
45 | - | |
46 | - res <- df %>% | |
47 | - slice(-ini_pos) %>% | |
48 | - add_row(row_pick, .before = fin_pos) | |
49 | -} | |
50 | - | |
51 | - return(res) | |
52 | -} | |
14 | +source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R") | |
53 | 15 | |
54 | 16 | |
55 | 17 | |
56 | -add_total <- function(x, pos=1, ...){ | |
57 | - adorn_totals(x, ...) %>% as_tibble %>% | |
58 | - move_row(nrow(.), pos) | |
59 | -} | |
18 | +## pattern_to_na <- function(df, pattern){ | |
19 | + | |
20 | +## res <- df %>% na_if(., pattern) | |
21 | + | |
22 | +## return(res) | |
23 | + | |
24 | +## } | |
25 | + | |
26 | + | |
27 | +## ### This function removes punctuation and special characters | |
28 | + | |
29 | +## remove_special_char <- function(x, new_pattern=""){ | |
30 | + | |
31 | +## ## remove special characters from a column | |
32 | + | |
33 | +## res <- str_replace_all(x, "[[:punct:]]", new_pattern) | |
34 | + | |
35 | +## res <- str_replace_all(x, "[^[:alnum:]]", new_pattern) | |
36 | + | |
37 | + | |
38 | +## res <- str_trim(res, side ="both") | |
39 | + | |
40 | + | |
41 | +## return(res) | |
42 | + | |
43 | +## } | |
44 | + | |
45 | + | |
46 | +## ## this function moves a given row of a table to the desired position | |
47 | + | |
48 | +## move_row <- function(df, ini_pos, fin_pos){ | |
49 | + | |
50 | +## ll <- nrow(df) | |
51 | + | |
52 | +## row_pick <- slice(df, ini_pos) | |
53 | + | |
54 | +## if (fin_pos=="last"){ | |
55 | + | |
56 | +## res <- df %>% | |
57 | +## slice(-ini_pos) %>% | |
58 | +## add_row(row_pick, .before = ll) | |
59 | + | |
60 | + | |
61 | +## } else{ | |
62 | + | |
63 | +## res <- df %>% | |
64 | +## slice(-ini_pos) %>% | |
65 | +## add_row(row_pick, .before = fin_pos) | |
66 | +## } | |
67 | + | |
68 | +## return(res) | |
69 | +## } | |
70 | + | |
71 | + | |
72 | +## ## this function adds a total row to a table | |
73 | + | |
74 | +## add_total <- function(x, pos=1, ...){ | |
75 | +## adorn_totals(x, ...) %>% as_tibble %>% | |
76 | +## move_row(nrow(.), pos) | |
77 | +## } | |
60 | 78 | |
61 | 79 | |
62 | 80 | |
63 | 81 | |
64 | -## a function to remove duplicated columns (see http://bit.ly/2s0q6cC ) | |
65 | - | |
66 | -rem_dupl_cols <- function(df){ | |
82 | +## ## a function to remove duplicated columns (see http://bit.ly/2s0q6cC ) | |
67 | 83 | |
68 | -res <- df[, !duplicated(t(df))] | |
84 | +## rem_dupl_cols <- function(df){ | |
69 | 85 | |
70 | -return(res) | |
86 | +## res <- df[, !duplicated(t(df))] | |
71 | 87 | |
72 | -} | |
88 | +## return(res) | |
89 | + | |
90 | +## } | |
73 | 91 | |
74 | 92 | |
75 | -## a function to remove constant columns (see http://bit.ly/2s1qqaY | |
76 | -## and http://bit.ly/2s1Sb3q . The solution in the second link is better | |
77 | -## because it works also with factors and characters ) | |
93 | +## ## a function to remove constant columns (see http://bit.ly/2s1qqaY | |
94 | +## ## and http://bit.ly/2s1Sb3q . The solution in the second link is better | |
95 | +## ## because it works also with factors and characters ) | |
78 | 96 | |
79 | -rem_const_cols <- function(df){ | |
80 | -## res <- df[,apply(df, 2, var, na.rm=TRUE) != 0] | |
97 | +## rem_const_cols <- function(df){ | |
98 | +## ## res <- df[,apply(df, 2, var, na.rm=TRUE) != 0] | |
81 | 99 | |
82 | 100 | |
83 | -## res <- df[sapply(df, function(x) length(unique(na.omit(x)))) > 1] | |
101 | +## ## res <- df[sapply(df, function(x) length(unique(na.omit(x)))) > 1] | |
84 | 102 | |
85 | - res <- df %>% | |
86 | - select(where(~length(unique(na.omit(.x))) > 1)) | |
103 | +## res <- df %>% | |
104 | +## select(where(~length(unique(na.omit(.x))) > 1)) | |
87 | 105 | |
88 | 106 | |
89 | 107 | |
90 | -return(res) | |
108 | +## return(res) | |
91 | 109 | |
92 | -} | |
93 | - | |
110 | +## } | |
94 | 111 | |
95 | 112 | |
96 | 113 | |
97 | 114 | |
98 | -clean_data <- function(x){ | |
115 | +## ## this function makes the column names machine-friendly and removes | |
116 | +## ## duplicated rows, duplicated columns and constant rows | |
99 | 117 | |
100 | - res <- x %>% | |
101 | - clean_names() %>% | |
102 | - remove_empty() %>% | |
103 | - distinct() %>% | |
104 | - rem_dupl_cols() %>% | |
105 | - rem_const_cols() | |
118 | +## clean_data <- function(x){ | |
106 | 119 | |
107 | - return(res) | |
120 | +## res <- x %>% | |
121 | +## clean_names() %>% | |
122 | +## remove_empty() %>% | |
123 | +## distinct() %>% | |
124 | +## rem_dupl_cols() %>% | |
125 | +## rem_const_cols() | |
126 | + | |
127 | +## return(res) | |
108 | 128 | |
109 | 129 | |
110 | -} | |
130 | +## } | |
111 | 131 | |
112 | 132 | |
113 | 133 | |
114 | -na_to_pattern <- function(df, pattern){ | |
134 | +## ## this function changes all the missing data in a table to the desired pattern | |
115 | 135 | |
116 | -res <- df %>% replace(., is.na(.), pattern) | |
136 | +## na_to_pattern <- function(df, pattern){ | |
117 | 137 | |
118 | -return(res) | |
138 | +## res <- df %>% replace(., is.na(.), pattern) | |
139 | + | |
140 | +## return(res) | |
119 | 141 | |
120 | -} | |
142 | +## } | |
121 | 143 | |
122 | 144 | |
123 | 145 | |
124 | -save_excel <- function(output, fileName, sheetName="data", na_yes = TRUE,...){ | |
125 | -tryCatch({ | |
146 | +## ### This function is used to save the output of the calculations as an excel file | |
126 | 147 | |
127 | - wb <- loadWorkbook(fileName) | |
128 | - addWorksheet(wb = wb, sheet = sheetName) | |
129 | - writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F, | |
130 | - keepNA=na_yes ,...) | |
131 | - saveWorkbook(wb = wb, file = fileName, overwrite = T) | |
132 | -}, | |
133 | -error = function(err){ | |
148 | +## save_excel <- function(output, fileName, sheetName="data", na_yes = TRUE,...){ | |
149 | +## tryCatch({ | |
134 | 150 | |
135 | - wb <- createWorkbook(fileName) | |
136 | - addWorksheet(wb = wb, sheet = sheetName) | |
137 | - writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F, keepNA=na_yes , ... ) | |
138 | - saveWorkbook(wb = wb, file = fileName, overwrite = T) | |
139 | -}) | |
140 | -} | |
151 | +## wb <- loadWorkbook(fileName) | |
152 | +## addWorksheet(wb = wb, sheet = sheetName) | |
153 | +## writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F, | |
154 | +## keepNA=na_yes ,...) | |
155 | +## saveWorkbook(wb = wb, file = fileName, overwrite = T) | |
156 | +## }, | |
157 | +## error = function(err){ | |
158 | + | |
159 | +## wb <- createWorkbook(fileName) | |
160 | +## addWorksheet(wb = wb, sheet = sheetName) | |
161 | +## writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F, keepNA=na_yes , ... ) | |
162 | +## saveWorkbook(wb = wb, file = fileName, overwrite = T) | |
163 | +## }) | |
164 | +## } | |
141 | 165 | |
142 | 166 | |
143 | 167 |
@@ -149,6 +173,10 @@ | ||
149 | 173 | |
150 | 174 | |
151 | 175 | |
176 | +## tf_list and tf_list2 are two lists of the same temporary framework | |
177 | +## chapters. They are written in a different way because they need to be | |
178 | +## looked for (grepped) in different places | |
179 | + | |
152 | 180 | tf_list <- c("TF1:", "TF2:", "TF3:", ## "TF4", |
153 | 181 | ## "TF5", |
154 | 182 | "TF6:", "TF7:", "TF8:", |
@@ -161,33 +189,37 @@ | ||
161 | 189 | "TF9", "TF10", "TF11", "TF12") |
162 | 190 | |
163 | 191 | |
192 | + | |
193 | +### import the raw data about the amendments | |
164 | 194 | df_am_ini <- read_csv("amendments.csv") |
165 | 195 | |
166 | - | |
196 | +## import the raw data about the decisions | |
167 | 197 | df_dec_ini <- read_csv("decisions.csv", , locale = readr::locale(encoding = "latin1") ) |
168 | 198 | |
169 | 199 | |
170 | - ## read.csv("decisions.csv", fileEncoding="latin1") | |
171 | - | |
172 | 200 | |
173 | 201 | df_am <- df_am_ini %>% |
174 | - clean_data() | |
202 | + clean_data() ### clean the amendments data | |
175 | 203 | |
176 | 204 | df_dec <- df_dec_ini %>% |
177 | - clean_data() | |
205 | + clean_data() ### clean the decision data | |
178 | 206 | |
179 | -rep1_ini <- read_csv("report1_detailed.csv") | |
207 | +rep1_ini <- read_csv("report1_detailed.csv") ## import the raw data from the | |
208 | +## full report | |
180 | 209 | |
181 | 210 | |
182 | 211 | |
183 | 212 | rep1 <- rep1_ini %>% |
184 | - clean_data() | |
213 | + clean_data() ## clean the data of the full report | |
185 | 214 | |
186 | 215 | rep1_unique <- rep1 %>% |
187 | - distinct(case_reference, .keep_all=T) | |
216 | + distinct(case_reference, .keep_all=T) ## generates the equivalent of | |
217 | +## report 1 no duplicates in the excel tracker. This is a subset of the full report without duplicated case references. | |
188 | 218 | |
189 | 219 | |
190 | 220 | |
221 | +### budget calculation. The report without duplicated is broken down (grouped_by) member states (MS) and for each MS we calculate the sum of the confirmed budget. For each MS we also calculate the share of the total budget. | |
222 | +## A column with the totals is also added. | |
191 | 223 | |
192 | 224 | budget_test <- rep1_unique %>% |
193 | 225 | group_by(member_state_2_letter_code) %>% |
@@ -195,151 +227,16 @@ | ||
195 | 227 | ungroup %>% |
196 | 228 | mutate(share=budget/sum(budget)) %>% |
197 | 229 | add_total(nrow(.)+1) |
198 | - | |
199 | - | |
200 | - | |
201 | - | |
202 | -decisions_test <- rep1_unique %>% | |
203 | - filter(!is.na(legal_basis))%>% | |
204 | - group_by(member_state_2_letter_code) %>% | |
205 | - summarise(n_decisions=n()) %>% | |
206 | - ungroup() %>% | |
207 | - arrange(member_state_2_letter_code)%>% | |
208 | - add_total(nrow(.)+1) | |
209 | - | |
210 | - | |
211 | - | |
212 | -count_am <- df_am %>% | |
213 | - group_by(member_state_of_amendment) %>% | |
214 | - summarise(n_amendments=length(unique(amendment_case_number))) %>% | |
215 | - ungroup%>% | |
216 | - add_total(nrow(.)+1) | |
217 | - | |
218 | - | |
219 | - | |
220 | -duration_calc <- rep1 %>% | |
221 | - filter(register_code %in% c("N", "PN"), | |
222 | - decided=="YES") %>% | |
223 | - select(case_reference, member_state_2_letter_code, average_duration, | |
224 | - register_code, legal_basis, type_of_measure )%>% | |
225 | - pivot_wider(names_from="register_code", values_from="average_duration") %>% | |
226 | - ## na_to_pattern(0) %>% | |
227 | - mutate(type=if_else(is.na(PN), "N", "P+N")) %>% | |
228 | - rowwise() %>% | |
229 | - mutate(total_duration=sum(N,PN, na.rm=T)) %>% | |
230 | - ungroup | |
231 | - | |
232 | - | |
233 | -## save_excel(duration_calc, "duration_raw_data.xlsx") | |
234 | - | |
235 | - | |
236 | - | |
237 | - | |
238 | -duration_stat_ms <- duration_calc %>% | |
239 | - group_by(member_state_2_letter_code) %>% | |
240 | - summarise(mean_duration=mean(total_duration, na.rm=T), | |
241 | - median_duration=median(total_duration, na.rm=T), | |
242 | - mean_PN_duration=mean(PN, na.rm=T), | |
243 | - median_PN_duration=median(PN, na.rm=T), | |
244 | - mean_N_duration=mean(N, na.rm=T), | |
245 | - median_N_duration=median(N, na.rm=T), | |
246 | - number_data_points_N=n()-sum(is.na(N)), | |
247 | - number_data_points_PN=n()-sum(is.na(PN)) | |
248 | - )%>% | |
249 | - ungroup | |
250 | - | |
251 | -## save_excel(duration_stat_ms, "duration_by_MS.xlsx") | |
252 | - | |
253 | 230 | |
254 | 231 | |
255 | -duration_stat_eu <- duration_calc %>% | |
256 | - ## group_by(legal_basis) %>% | |
257 | - summarise(mean_duration=mean(total_duration, na.rm=T), | |
258 | - median_duration=median(total_duration, na.rm=T), | |
259 | - mean_PN_duration=mean(PN, na.rm=T), | |
260 | - median_PN_duration=median(PN, na.rm=T), | |
261 | - mean_N_duration=mean(N, na.rm=T), | |
262 | - median_N_duration=median(N, na.rm=T), | |
263 | - number_data_points_N=n()-sum(is.na(N)), | |
264 | - number_data_points_PN=n()-sum(is.na(PN))) %>% | |
265 | - ungroup | |
266 | - | |
267 | -## save_excel(duration_stat_eu, "duration_all_eu.xlsx") | |
268 | - | |
269 | -duration_stat_basis <- duration_calc %>% | |
270 | - group_by(legal_basis) %>% | |
271 | - summarise(mean_duration=mean(total_duration, na.rm=T), | |
272 | - median_duration=median(total_duration, na.rm=T), | |
273 | - mean_PN_duration=mean(PN, na.rm=T), | |
274 | - median_PN_duration=median(PN, na.rm=T), | |
275 | - mean_N_duration=mean(N, na.rm=T), | |
276 | - median_N_duration=median(N, na.rm=T), | |
277 | - number_data_points_N=n()-sum(is.na(N)), | |
278 | - number_data_points_PN=n()-sum(is.na(PN)))%>% | |
279 | - ungroup | |
280 | - | |
281 | -## save_excel(duration_stat_basis, "duration_by_legal_basis.xlsx") | |
282 | - | |
283 | - | |
284 | - | |
285 | - | |
286 | -duration_stat_basis_ms <- duration_calc %>% | |
287 | - group_by(member_state_2_letter_code,legal_basis) %>% | |
288 | - summarise(mean_duration=mean(total_duration, na.rm=T), | |
289 | - median_duration=median(total_duration, na.rm=T), | |
290 | - mean_PN_duration=mean(PN, na.rm=T), | |
291 | - median_PN_duration=median(PN, na.rm=T), | |
292 | - mean_N_duration=mean(N, na.rm=T), | |
293 | - median_N_duration=median(N, na.rm=T), | |
294 | - number_data_points_N=n()-sum(is.na(N)), | |
295 | - number_data_points_PN=n()-sum(is.na(PN)))%>% | |
296 | - ungroup %>% | |
297 | - arrange(member_state_2_letter_code) | |
298 | - | |
299 | - | |
300 | - | |
301 | - | |
302 | - | |
303 | - | |
304 | - | |
305 | -duration_stat_measure <- duration_calc %>% | |
306 | - group_by(type_of_measure) %>% | |
307 | - summarise(mean_duration=mean(total_duration, na.rm=T), | |
308 | - median_duration=median(total_duration, na.rm=T), | |
309 | - mean_PN_duration=mean(PN, na.rm=T), | |
310 | - median_PN_duration=median(PN, na.rm=T), | |
311 | - mean_N_duration=mean(N, na.rm=T), | |
312 | - median_N_duration=median(N, na.rm=T), | |
313 | - number_data_points_N=n()-sum(is.na(N)), | |
314 | - number_data_points_PN=n()-sum(is.na(PN)))%>% | |
315 | - ungroup | |
316 | - | |
317 | - | |
318 | - | |
319 | - | |
320 | -duration_stat_measure_ms <- duration_calc %>% | |
321 | - group_by(member_state_2_letter_code,type_of_measure) %>% | |
322 | - summarise(mean_duration=mean(total_duration, na.rm=T), | |
323 | - median_duration=median(total_duration, na.rm=T), | |
324 | - mean_PN_duration=mean(PN, na.rm=T), | |
325 | - median_PN_duration=median(PN, na.rm=T), | |
326 | - mean_N_duration=mean(N, na.rm=T), | |
327 | - median_N_duration=median(N, na.rm=T), | |
328 | - number_data_points_N=n()-sum(is.na(N)), | |
329 | - number_data_points_PN=n()-sum(is.na(PN)))%>% | |
330 | - ungroup %>% | |
331 | - arrange(member_state_2_letter_code) | |
332 | - | |
333 | - | |
334 | - | |
335 | - | |
336 | -## save_excel(duration_stat_measure, "duration_by_legal_measure.xlsx") | |
337 | 232 | |
338 | 233 | |
339 | 234 | aid_type <- tibble(code=c("AH", "IA", "S"), |
340 | 235 | name=c("Ad hoc", "Individual application within a scheme", |
341 | 236 | "Schemes")) |
342 | 237 | |
238 | +### budget broken by type | |
239 | + | |
343 | 240 | budget_type <- rep1_unique %>% |
344 | 241 | group_by(## member_state_2_letter_code, |
345 | 242 | case_type_code) %>% |
@@ -352,6 +249,7 @@ | ||
352 | 249 | add_total(nrow(.)+1) |
353 | 250 | |
354 | 251 | |
252 | +### budget broken by type and MS | |
355 | 253 | |
356 | 254 | budget_type_ms <- rep1_unique %>% |
357 | 255 | group_by( member_state_2_letter_code, |
@@ -370,7 +268,7 @@ | ||
370 | 268 | |
371 | 269 | |
372 | 270 | |
373 | - | |
271 | +#### budget broken by measure | |
374 | 272 | |
375 | 273 | |
376 | 274 | budget_measure <- rep1_unique %>% |
@@ -384,7 +282,7 @@ | ||
384 | 282 | |
385 | 283 | |
386 | 284 | |
387 | - | |
285 | +#### budget broken by measure and MS | |
388 | 286 | |
389 | 287 | budget_measure_ms <- rep1_unique %>% |
390 | 288 | group_by( member_state_2_letter_code, |
@@ -405,9 +303,107 @@ | ||
405 | 303 | |
406 | 304 | |
407 | 305 | |
306 | +### budget broken by beneficiary sector | |
408 | 307 | |
409 | 308 | |
410 | 309 | |
310 | +budget_by_beneficiary_sector <- rep1_unique %>% | |
311 | + group_by(beneficary_sector) %>% | |
312 | + summarise(budget=sum(confirmed_budgets, na.rm=T)) %>% | |
313 | + ungroup %>% | |
314 | + filter(complete.cases(.)) %>% | |
315 | + mutate(share=budget/sum(budget)) %>% | |
316 | + arrange(desc(budget))%>% | |
317 | + add_total(nrow(.)+1) | |
318 | + | |
319 | + | |
320 | +### budget broken by MS and beneficiary sector | |
321 | + | |
322 | +budget_by_beneficiary_sector_ms <- rep1_unique %>% | |
323 | + group_by(member_state_2_letter_code,beneficary_sector) %>% | |
324 | + summarise(budget=sum(confirmed_budgets, na.rm=T)) %>% | |
325 | + na.omit %>% | |
326 | + mutate(share=budget/sum(budget)) %>% | |
327 | + group_modify(~ .x %>% | |
328 | + adorn_totals("row")) %>% | |
329 | + ungroup%>% | |
330 | + complete(member_state_2_letter_code,beneficary_sector) %>% | |
331 | + na_to_pattern(0) | |
332 | + | |
333 | + | |
334 | + | |
335 | + | |
336 | +### budget broken by beneficiary type | |
337 | + | |
338 | + | |
339 | +budget_by_beneficiary_type <- rep1_unique %>% | |
340 | + group_by(beneficary_type) %>% | |
341 | + summarise(budget=sum(confirmed_budgets, na.rm=T)) %>% | |
342 | + ungroup %>% | |
343 | + filter(complete.cases(.)) %>% | |
344 | + mutate(share=budget/sum(budget)) %>% | |
345 | + arrange(desc(budget))%>% | |
346 | + add_total(nrow(.)+1) | |
347 | + | |
348 | + | |
349 | + | |
350 | +### budget broken by beneficiary type and MS | |
351 | + | |
352 | + | |
353 | +budget_by_beneficiary_type_ms <- rep1_unique %>% | |
354 | + group_by(member_state_2_letter_code,beneficary_type) %>% | |
355 | + summarise(budget=sum(confirmed_budgets, na.rm=T)) %>% | |
356 | + na.omit %>% | |
357 | + mutate(share=budget/sum(budget)) %>% | |
358 | + group_modify(~ .x %>% | |
359 | + adorn_totals("row")) %>% | |
360 | + ungroup%>% | |
361 | + complete(member_state_2_letter_code,beneficary_type) %>% | |
362 | + na_to_pattern(0) | |
363 | + | |
364 | + | |
365 | + | |
366 | + | |
367 | + | |
368 | + | |
369 | + | |
370 | + | |
371 | + | |
372 | + | |
373 | + | |
374 | + | |
375 | + | |
376 | + | |
377 | + | |
378 | + | |
379 | + | |
380 | + | |
381 | + | |
382 | + | |
383 | + | |
384 | + | |
385 | + | |
386 | + | |
387 | + | |
388 | + | |
389 | + | |
390 | +### statistics on the number of decisions. | |
391 | +## from the report file without duplicates, we remove those rows for which | |
392 | +## the legal basis is missing. We break down the remaining data by MS and | |
393 | +## we count the number of decisions for each MS (just the number of lines for each MS) and we add a total row | |
394 | + | |
395 | +decisions_test <- rep1_unique %>% | |
396 | + filter(!is.na(legal_basis))%>% | |
397 | + group_by(member_state_2_letter_code) %>% | |
398 | + summarise(n_decisions=n()) %>% | |
399 | + ungroup() %>% | |
400 | + arrange(member_state_2_letter_code)%>% | |
401 | + add_total(nrow(.)+1) | |
402 | + | |
403 | + | |
404 | + | |
405 | +### number of decisions by legal_basis | |
406 | + | |
411 | 407 | decisions_eu <- rep1_unique %>% |
412 | 408 | filter(!is.na(legal_basis))%>% |
413 | 409 | group_by(legal_basis) %>% |
@@ -418,6 +414,8 @@ | ||
418 | 414 | add_total(nrow(.)+1) |
419 | 415 | |
420 | 416 | |
417 | +### number of decisions by legal_basis and MS | |
418 | + | |
421 | 419 | |
422 | 420 | decisions_eu_ms <- rep1_unique %>% |
423 | 421 | filter(!is.na(legal_basis))%>% |
@@ -436,6 +434,300 @@ | ||
436 | 434 | |
437 | 435 | |
438 | 436 | |
437 | + | |
438 | +## statistics on the number of amendements. We use the first two columns of | |
439 | +## the amendments file. We break the amendment data by MS and we count the number of amendments for each MS as the number of *unique* amendment case numbers (in case some of them are repeated) | |
440 | + | |
441 | + | |
442 | +count_am <- df_am %>% | |
443 | + group_by(member_state_of_amendment) %>% | |
444 | + summarise(n_amendments=length(unique(amendment_case_number))) %>% | |
445 | + ungroup%>% | |
446 | + add_total(nrow(.)+1) | |
447 | + | |
448 | + | |
449 | +### this the fundamental table for the duration calculations | |
450 | +## we consider the full report and we filter the register code to be | |
451 | +## either N of PN and the case needs to have been decided (i.e. decided = YES) | |
452 | +## we select the columns case_reference, member_state_2_letter_code, | |
453 | +## average_duration, register_code, legal_basis, type_of_measure | |
454 | +## and we pivot the table. | |
455 | + | |
456 | +## Finally we obtain a table with the structure | |
457 | + | |
458 | +## case_reference member_state_2_… legal_basis type_of_measure N PN type total_duration | |
459 | + | |
460 | +## in which we have the duration of the notification and prenotification phase of all the decided cases | |
461 | + | |
462 | +duration_calc <- rep1 %>% | |
463 | + filter(register_code %in% c("N", "PN"), | |
464 | + decided=="YES") %>% | |
465 | + select(case_reference, member_state_2_letter_code, average_duration, | |
466 | + register_code, legal_basis, type_of_measure )%>% | |
467 | + pivot_wider(names_from="register_code", values_from="average_duration") %>% | |
468 | + ## na_to_pattern(0) %>% | |
469 | + mutate(type=if_else(is.na(PN), "N", "P+N")) %>% | |
470 | + rowwise() %>% | |
471 | + mutate(total_duration=sum(N,PN, na.rm=T)) %>% | |
472 | + ungroup | |
473 | + | |
474 | + | |
475 | +## save_excel(duration_calc, "duration_decisions_raw_data.xlsx") | |
476 | + | |
477 | + | |
478 | + | |
479 | + | |
480 | + | |
481 | + | |
482 | + | |
483 | + | |
484 | +## we break the duration_calc data by MS and we calculate statistics | |
485 | +## on the duration of the N, PN and combined N+PN process. | |
486 | + | |
487 | +duration_stat_ms <- duration_calc %>% | |
488 | + group_by(member_state_2_letter_code) %>% | |
489 | + summarise(mean_duration=mean(total_duration, na.rm=T), | |
490 | + median_duration=median(total_duration, na.rm=T), | |
491 | + mean_PN_duration=mean(PN, na.rm=T), | |
492 | + median_PN_duration=median(PN, na.rm=T), | |
493 | + mean_N_duration=mean(N, na.rm=T), | |
494 | + median_N_duration=median(N, na.rm=T), | |
495 | + number_data_points_N=n()-sum(is.na(N)), | |
496 | + number_data_points_PN=n()-sum(is.na(PN)) | |
497 | + )%>% | |
498 | + ungroup | |
499 | + | |
500 | +## save_excel(duration_stat_ms, "duration_by_MS.xlsx") | |
501 | + | |
502 | + | |
503 | +### As above, but without breaking the data by MS ---> we calculate the stats for the whole EU. | |
504 | + | |
505 | +duration_stat_eu <- duration_calc %>% | |
506 | + ## group_by(legal_basis) %>% | |
507 | + summarise(mean_duration=mean(total_duration, na.rm=T), | |
508 | + median_duration=median(total_duration, na.rm=T), | |
509 | + mean_PN_duration=mean(PN, na.rm=T), | |
510 | + median_PN_duration=median(PN, na.rm=T), | |
511 | + mean_N_duration=mean(N, na.rm=T), | |
512 | + median_N_duration=median(N, na.rm=T), | |
513 | + number_data_points_N=n()-sum(is.na(N)), | |
514 | + number_data_points_PN=n()-sum(is.na(PN))) %>% | |
515 | + ungroup | |
516 | + | |
517 | +## save_excel(duration_stat_eu, "duration_all_eu.xlsx") | |
518 | + | |
519 | + | |
520 | +#### Now the duration statistics is calculated by breaking the data according | |
521 | +## to the legal basis | |
522 | + | |
523 | +duration_stat_basis <- duration_calc %>% | |
524 | + group_by(legal_basis) %>% | |
525 | + summarise(mean_duration=mean(total_duration, na.rm=T), | |
526 | + median_duration=median(total_duration, na.rm=T), | |
527 | + mean_PN_duration=mean(PN, na.rm=T), | |
528 | + median_PN_duration=median(PN, na.rm=T), | |
529 | + mean_N_duration=mean(N, na.rm=T), | |
530 | + median_N_duration=median(N, na.rm=T), | |
531 | + number_data_points_N=n()-sum(is.na(N)), | |
532 | + number_data_points_PN=n()-sum(is.na(PN)))%>% | |
533 | + ungroup | |
534 | + | |
535 | +## save_excel(duration_stat_basis, "duration_by_legal_basis.xlsx") | |
536 | + | |
537 | + | |
538 | +## Now the data is broken both by legal basis and by duration statistics. | |
539 | + | |
540 | +duration_stat_basis_ms <- duration_calc %>% | |
541 | + group_by(member_state_2_letter_code,legal_basis) %>% | |
542 | + summarise(mean_duration=mean(total_duration, na.rm=T), | |
543 | + median_duration=median(total_duration, na.rm=T), | |
544 | + mean_PN_duration=mean(PN, na.rm=T), | |
545 | + median_PN_duration=median(PN, na.rm=T), | |
546 | + mean_N_duration=mean(N, na.rm=T), | |
547 | + median_N_duration=median(N, na.rm=T), | |
548 | + number_data_points_N=n()-sum(is.na(N)), | |
549 | + number_data_points_PN=n()-sum(is.na(PN)))%>% | |
550 | + ungroup %>% | |
551 | + arrange(member_state_2_letter_code) | |
552 | + | |
553 | + | |
554 | + | |
555 | + | |
556 | + | |
557 | +#### now the duration statistics is broken by type of measure. | |
558 | + | |
559 | +duration_stat_measure <- duration_calc %>% | |
560 | + group_by(type_of_measure) %>% | |
561 | + summarise(mean_duration=mean(total_duration, na.rm=T), | |
562 | + median_duration=median(total_duration, na.rm=T), | |
563 | + mean_PN_duration=mean(PN, na.rm=T), | |
564 | + median_PN_duration=median(PN, na.rm=T), | |
565 | + mean_N_duration=mean(N, na.rm=T), | |
566 | + median_N_duration=median(N, na.rm=T), | |
567 | + number_data_points_N=n()-sum(is.na(N)), | |
568 | + number_data_points_PN=n()-sum(is.na(PN)))%>% | |
569 | + ungroup | |
570 | + | |
571 | + | |
572 | +### and now it is also further broken by MS. | |
573 | + | |
574 | +duration_stat_measure_ms <- duration_calc %>% | |
575 | + group_by(member_state_2_letter_code,type_of_measure) %>% | |
576 | + summarise(mean_duration=mean(total_duration, na.rm=T), | |
577 | + median_duration=median(total_duration, na.rm=T), | |
578 | + mean_PN_duration=mean(PN, na.rm=T), | |
579 | + median_PN_duration=median(PN, na.rm=T), | |
580 | + mean_N_duration=mean(N, na.rm=T), | |
581 | + median_N_duration=median(N, na.rm=T), | |
582 | + number_data_points_N=n()-sum(is.na(N)), | |
583 | + number_data_points_PN=n()-sum(is.na(PN)))%>% | |
584 | + ungroup %>% | |
585 | + arrange(member_state_2_letter_code) | |
586 | + | |
587 | + | |
588 | + | |
589 | + | |
590 | + | |
591 | + | |
592 | + | |
593 | + | |
594 | + | |
595 | + | |
596 | + | |
597 | + | |
598 | + | |
599 | + | |
600 | + | |
601 | + | |
602 | + | |
603 | + | |
604 | + | |
605 | + | |
606 | +#### I keep only some columns of the duration calc table and I rename them | |
607 | +duration_calc_restricted <- duration_calc %>% | |
608 | + select(case_reference,member_state_2_letter_code, N, PN) %>% | |
609 | + na_to_pattern(0) %>% | |
610 | + mutate(total_duration=N+PN) %>% | |
611 | + select(-c(N, PN)) %>% | |
612 | + rename("MS"="member_state_2_letter_code") %>% | |
613 | + mutate(type="decision") | |
614 | + | |
615 | + | |
616 | + | |
617 | + | |
618 | +###file with the raw data on the duration of the amendments. | |
619 | +### it is important to know that when a PN notification has a duration of zero, | |
620 | +## it actually means that it had never taken place and so the zeros should be | |
621 | +## treated as missing data. | |
622 | + | |
623 | +## I also keep only the distinct amendment numbers to avoid double-counting | |
624 | +## the amendments. | |
625 | + | |
626 | +duration_calc_am <- df_am %>% | |
627 | + distinct(amendment_case_number, .keep_all=T) %>% | |
628 | + select(case_number_key , amendment_case_number,member_state_of_amendment, | |
629 | + duration_of_n_phase_calendar_days, duration_of_pn_phase_calendar_days,total_duration_pn_n_calendar_days | |
630 | + ) %>% | |
631 | + pattern_to_na(0) %>% | |
632 | + mutate(is_PN_plus_N=if_else(!is.na(duration_of_pn_phase_calendar_days), | |
633 | + "yes", "no")) | |
634 | + | |
635 | + | |
636 | +### I now select only some columns of the duration_calc table | |
637 | +duration_calc_am_restricted <- duration_calc_am %>% | |
638 | + select(case_number_key,member_state_of_amendment,total_duration_pn_n_calendar_days ) %>% | |
639 | + rename("case_reference"="case_number_key", | |
640 | + "MS"="member_state_of_amendment", | |
641 | + "total_duration"="total_duration_pn_n_calendar_days") %>% | |
642 | + mutate(type="amendment") | |
643 | + | |
644 | + | |
645 | + | |
646 | +#### Here I create a table where for every case I list the time to take a decision and the duration of each amendment process. | |
647 | + | |
648 | +duration_am_plus_decisions <- bind_rows(duration_calc_am_restricted, | |
649 | + duration_calc_restricted) %>% | |
650 | + arrange(MS, case_reference) | |
651 | + | |
652 | +save_excel(duration_am_plus_decisions, "duration_decisions_amendments_raw_data.xlsx") | |
653 | + | |
654 | + | |
655 | +## I calculate some statistics by MS on the collected data on durations and decisions. | |
656 | + | |
657 | +duration_am_plus_decisions_ms <- duration_am_plus_decisions %>% | |
658 | + group_by(MS) %>% | |
659 | + summarise(mean_duration=mean(total_duration), | |
660 | + median_duration=median(total_duration)) %>% | |
661 | + ungroup | |
662 | + | |
663 | + | |
664 | +## and now the same stats at the EU level. | |
665 | +duration_am_plus_decisions_eu <- duration_am_plus_decisions %>% | |
666 | + ## group_by(MS) %>% | |
667 | + summarise(mean_duration=mean(total_duration), | |
668 | + median_duration=median(total_duration)) ## %>% | |
669 | + ## ungroup | |
670 | + | |
671 | + | |
672 | + | |
673 | +### I now calculate the statistics on the duration of the amendments process | |
674 | +## at the MS level | |
675 | + | |
676 | +duration_stat_am_ms <- duration_calc_am %>% | |
677 | + group_by(member_state_of_amendment) %>% | |
678 | + summarise(mean_duration=mean(total_duration_pn_n_calendar_days, na.rm=T), | |
679 | + median_duration=median(total_duration_pn_n_calendar_days, na.rm=T), | |
680 | + mean_PN_duration=mean(duration_of_pn_phase_calendar_days , | |
681 | + na.rm=T), | |
682 | + median_PN_duration=median(duration_of_pn_phase_calendar_days , | |
683 | + na.rm=T), | |
684 | + mean_N_duration=mean(duration_of_n_phase_calendar_days, na.rm=T), | |
685 | + median_N_duration=median(duration_of_n_phase_calendar_days, na.rm=T), | |
686 | + number_data_points_N=n()-sum(is.na(duration_of_n_phase_calendar_days)), | |
687 | + number_data_points_PN=n()-sum(is.na(duration_of_pn_phase_calendar_days)) | |
688 | + )%>% | |
689 | + ungroup | |
690 | + | |
691 | + | |
692 | + | |
693 | + | |
694 | +## and now the same stats at the EU level. | |
695 | + | |
696 | +duration_stat_am_eu <- duration_calc_am %>% | |
697 | + ## group_by(member_state_of_amendment) %>% | |
698 | + summarise(mean_duration=mean(total_duration_pn_n_calendar_days, na.rm=T), | |
699 | + median_duration=median(total_duration_pn_n_calendar_days, na.rm=T), | |
700 | + mean_PN_duration=mean(duration_of_pn_phase_calendar_days , | |
701 | + na.rm=T), | |
702 | + median_PN_duration=median(duration_of_pn_phase_calendar_days , | |
703 | + na.rm=T), | |
704 | + mean_N_duration=mean(duration_of_n_phase_calendar_days, na.rm=T), | |
705 | + median_N_duration=median(duration_of_n_phase_calendar_days, na.rm=T), | |
706 | + number_data_points_N=n()-sum(is.na(duration_of_n_phase_calendar_days)), | |
707 | + number_data_points_PN=n()-sum(is.na(duration_of_pn_phase_calendar_days)) | |
708 | + )## %>% | |
709 | + ## ungroup | |
710 | + | |
711 | + | |
712 | + | |
713 | + | |
714 | + | |
715 | + | |
716 | + | |
717 | + | |
718 | +## save_excel(duration_stat_measure, "duration_by_legal_measure.xlsx") | |
719 | + | |
720 | + | |
721 | + | |
722 | + | |
723 | + | |
724 | + | |
725 | + | |
726 | + | |
727 | + | |
728 | + | |
729 | +### This time we just count the number of measures by legal basis | |
730 | + | |
439 | 731 | measures_covid <- rep1_unique %>% |
440 | 732 | filter(decided=="YES", !is.na(legal_basis))%>% |
441 | 733 | group_by(legal_basis) %>% |
@@ -445,7 +737,7 @@ | ||
445 | 737 | mutate(share=n_measures/sum(n_measures))%>% |
446 | 738 | add_total(nrow(.)+1) |
447 | 739 | |
448 | - | |
740 | +### now number of measures by legal basis and MS | |
449 | 741 | |
450 | 742 | measures_covid_ms <- rep1_unique %>% |
451 | 743 | filter(decided=="YES", !is.na(legal_basis))%>% |
@@ -464,6 +756,7 @@ | ||
464 | 756 | |
465 | 757 | |
466 | 758 | |
759 | +### Statistics on SA cases under assessment | |
467 | 760 | |
468 | 761 | sa_n_pn <- rep1_unique %>% |
469 | 762 | ## filter(decided=="YES", !is.na(legal_basis), |
@@ -476,7 +769,7 @@ | ||
476 | 769 | add_total(nrow(.)+1) |
477 | 770 | |
478 | 771 | |
479 | - | |
772 | +### as above, but also broken by MS | |
480 | 773 | sa_n_pn_ms <- rep1_unique %>% |
481 | 774 | ## filter(decided=="YES", !is.na(legal_basis), |
482 | 775 | ## register_code %in% c("N", "PN"))%>% |
@@ -494,7 +787,7 @@ | ||
494 | 787 | |
495 | 788 | |
496 | 789 | |
497 | - | |
790 | +### and this time broken by Unit dealing with them | |
498 | 791 | sa_n_pn_unit <- rep1_unique %>% |
499 | 792 | ## filter(decided=="YES", !is.na(legal_basis), |
500 | 793 | ## register_code %in% c("N", "PN"))%>% |
@@ -509,6 +802,7 @@ | ||
509 | 802 | |
510 | 803 | |
511 | 804 | |
805 | +### now broken by both unit and MS | |
512 | 806 | |
513 | 807 | sa_n_pn_unit_ms <- rep1_unique %>% |
514 | 808 | ## filter(decided=="YES", !is.na(legal_basis), |
@@ -529,65 +823,9 @@ | ||
529 | 823 | |
530 | 824 | |
531 | 825 | |
532 | - | |
533 | - | |
534 | -budget_by_beneficiary_sector <- rep1_unique %>% | |
535 | - group_by(beneficary_sector) %>% | |
536 | - summarise(budget=sum(confirmed_budgets, na.rm=T)) %>% | |
537 | - ungroup %>% | |
538 | - filter(complete.cases(.)) %>% | |
539 | - mutate(share=budget/sum(budget)) %>% | |
540 | - arrange(desc(budget))%>% | |
541 | - add_total(nrow(.)+1) | |
542 | - | |
543 | - | |
544 | - | |
545 | - | |
546 | -budget_by_beneficiary_sector_ms <- rep1_unique %>% | |
547 | - group_by(member_state_2_letter_code,beneficary_sector) %>% | |
548 | - summarise(budget=sum(confirmed_budgets, na.rm=T)) %>% | |
549 | - na.omit %>% | |
550 | - mutate(share=budget/sum(budget)) %>% | |
551 | - group_modify(~ .x %>% | |
552 | - adorn_totals("row")) %>% | |
553 | - ungroup%>% | |
554 | - complete(member_state_2_letter_code,beneficary_sector) %>% | |
555 | - na_to_pattern(0) | |
556 | - | |
557 | - | |
558 | - | |
559 | - | |
560 | - | |
561 | - | |
562 | - | |
563 | -budget_by_beneficiary_type <- rep1_unique %>% | |
564 | - group_by(beneficary_type) %>% | |
565 | - summarise(budget=sum(confirmed_budgets, na.rm=T)) %>% | |
566 | - ungroup %>% | |
567 | - filter(complete.cases(.)) %>% | |
568 | - mutate(share=budget/sum(budget)) %>% | |
569 | - arrange(desc(budget))%>% | |
570 | - add_total(nrow(.)+1) | |
571 | - | |
572 | - | |
573 | - | |
574 | - | |
575 | - | |
576 | -budget_by_beneficiary_type_ms <- rep1_unique %>% | |
577 | - group_by(member_state_2_letter_code,beneficary_type) %>% | |
578 | - summarise(budget=sum(confirmed_budgets, na.rm=T)) %>% | |
579 | - na.omit %>% | |
580 | - mutate(share=budget/sum(budget)) %>% | |
581 | - group_modify(~ .x %>% | |
582 | - adorn_totals("row")) %>% | |
583 | - ungroup%>% | |
584 | - complete(member_state_2_letter_code,beneficary_type) %>% | |
585 | - na_to_pattern(0) | |
586 | - | |
587 | - | |
588 | - | |
589 | - | |
590 | - | |
826 | +### from the rep1_unique table, we filter the decided cases with procedure | |
827 | +## either N or PN. then we collapse all the text in type_of_measure column and we count how many times we find there the words in tf_list (i.e. the TF chapters) | |
828 | +## we want to track. | |
591 | 829 | |
592 | 830 | chapters_stat_test <- rep1_unique %>% |
593 | 831 | filter(decided=="YES", |
@@ -599,18 +837,19 @@ | ||
599 | 837 | mutate(share=value/sum(value))%>% |
600 | 838 | mutate(TF_chapter=tf_list) |
601 | 839 | |
602 | -## test1 <- rep1_unique %>% | |
603 | -## filter(decided=="YES", | |
604 | -## register_code %in% c("N", "PN"))%>% | |
605 | -## filter(grepl("TF3",type_of_measure) ) | |
606 | 840 | |
607 | -## test1bis <- rep1_unique %>% | |
608 | -## filter(case_reference=="SA.57574") | |
841 | +### The calculation above is correct but it does not allow one to see exactly | |
842 | +## to which TF chapter each case is associated to. | |
843 | + | |
609 | 844 | |
610 | 845 | tf <- rep1_unique %>% |
611 | 846 | filter(decided=="YES", |
612 | - register_code %in% c("N", "PN")) | |
613 | - | |
847 | + register_code %in% c("N", "PN")) ### I again take the decided | |
848 | + ### case either N or PN as register_code | |
849 | + | |
850 | +### here I use map_df to iterate the filtering of the tf dataframe according | |
851 | +## to the tf_list of chapters | |
852 | + | |
614 | 853 | tf_all <- map_df(tf_list, function(x) {filter(tf,grepl(x, type_of_measure)) %>% |
615 | 854 | mutate(TF=x) }) %>% |
616 | 855 | mutate(TF=remove_special_char(TF)) %>% |
@@ -623,6 +862,11 @@ | ||
623 | 862 | ## filter(grepl("TF1", type_of_measure)) |
624 | 863 | |
625 | 864 | |
865 | +### I need to treat separately the cases associated to more than 3 chapters | |
866 | +## because they are stored somewhere else. These calculations are along | |
867 | +## the lines of chapter_stat_test, but I need tf_list2 because the TF chapters | |
868 | +## are written a bit differently. | |
869 | + | |
626 | 870 | chapters_stat_extra <- df_dec %>% |
627 | 871 | pull(specify_tf_others_more_than_three_sections_combined) %>% |
628 | 872 | paste0(collapse = " ") %>% |
@@ -631,6 +875,7 @@ | ||
631 | 875 | mutate(share=value/sum(value))%>% |
632 | 876 | mutate(TF_chapter=tf_list) |
633 | 877 | |
878 | +### and once again, if I want to have the association between case and TF chapter, I run calculations similar to those for tf_all | |
634 | 879 | |
635 | 880 | tf_all2 <- map_df(tf_list2, function(x) {filter(df_dec, |
636 | 881 | grepl(x, specify_tf_others_more_than_three_sections_combined)) %>% |
@@ -640,14 +885,17 @@ | ||
640 | 885 | rename("description"="specify_tf_others_more_than_three_sections_combined") |
641 | 886 | |
642 | 887 | |
888 | +### and now I have the detailed situation for all cases and chapters | |
889 | + | |
643 | 890 | all_chapter_cases <- bind_rows(tf_all, tf_all2) |
644 | 891 | |
645 | -## save_excel(all_chapter_cases, "chapters_detailed.xlsx") | |
646 | - | |
647 | 892 | |
648 | 893 | |
649 | 894 | |
650 | 895 | |
896 | +#### If I just need the statistics on all the TF chapters, I can add the count | |
897 | +## data on chapter usage from chapter_stat_test and chapter_stat_extra | |
898 | + | |
651 | 899 | |
652 | 900 | chapters_stat_fin <- chapters_stat_test %>% |
653 | 901 | select(TF_chapter, value) %>% |
@@ -657,6 +905,8 @@ | ||
657 | 905 | mutate(TF_chapter=remove_special_char(TF_chapter)) |
658 | 906 | |
659 | 907 | |
908 | +#### I can finally use the detailed statistics in all_chapter cases | |
909 | +### to count the number of TF chapter per MS. | |
660 | 910 | |
661 | 911 | chapters_stat_fin_ms <- all_chapter_cases %>% |
662 | 912 | tabyl(member_state_code, TF) %>% |