OSDN > 開発者 > larry77 > 作業部屋 > myprojects-hg-reborn > コミット

myprojects-hg-reborn
Fork

(オリジナルレポジトリ: Fork元はありません)

コミット

コミットメタ情報

リビジョン	b0f8a88cf65c483105bda59b5b44f7d9711392e8 (tree)
日時	2021-02-19 23:47:03
作者	Lorenzo Isella <lorenzo.isella@gmai...>
コミッター	Lorenzo Isella

ログメッセージ

I added some extra stats and I commented most of the code.

変更サマリ

modified: R-codes/process_covid_tracker.R (diff)

差分

diff -r 5657114b3a2e -r b0f8a88cf65c R-codes/process_covid_tracker.R

--- a/R-codes/process_covid_tracker.R Wed Feb 17 22:48:55 2021 +0100

+++ b/R-codes/process_covid_tracker.R Fri Feb 19 15:47:03 2021 +0100

		@@ -1,6 +1,6 @@
1	1	rm(list=ls())
2	2
3		-## last saved on Time-stamp: "2021-02-15 11:06:30 lorenzo"
	3	+## last saved on Time-stamp: "2021-02-19 15:46:30 lorenzo"
4	4
5	5
6	6	library(tidyverse)

		@@ -9,135 +9,159 @@
9	9	library(openxlsx)
10	10	library(stringr)
11	11
12		-## source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")
13		-
14		-remove_special_char <- function(x, new_pattern=""){
15		-
16		-## remove special characters from a column
17		-
18		-res <- str_replace_all(x, "[[:punct:]]", new_pattern)
19		-
20		-res <- str_replace_all(x, "[^[:alnum:]]", new_pattern)
21		-
22		-
23		-res <- str_trim(res, side ="both")
24		-
25		-
26		-return(res)
27		-
28		-}
29		-
	12	+library(kableExtra)
30	13
31		-move_row <- function(df, ini_pos, fin_pos){
32		-
33		-ll <- nrow(df)
34		-
35		-row_pick <- slice(df, ini_pos)
36		-
37		- if (fin_pos=="last"){
38		-
39		- res <- df %>%
40		- slice(-ini_pos) %>%
41		- add_row(row_pick, .before = ll)
42		-
43		-
44		-} else{
45		-
46		- res <- df %>%
47		- slice(-ini_pos) %>%
48		- add_row(row_pick, .before = fin_pos)
49		-}
50		-
51		- return(res)
52		-}
	14	+source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")
53	15
54	16
55	17
56		-add_total <- function(x, pos=1, ...){
57		- adorn_totals(x, ...) %>% as_tibble %>%
58		- move_row(nrow(.), pos)
59		-}
	18	+## pattern_to_na <- function(df, pattern){
	19	+
	20	+## res <- df %>% na_if(., pattern)
	21	+
	22	+## return(res)
	23	+
	24	+## }
	25	+
	26	+
	27	+## ### This function removes punctuation and special characters
	28	+
	29	+## remove_special_char <- function(x, new_pattern=""){
	30	+
	31	+## ## remove special characters from a column
	32	+
	33	+## res <- str_replace_all(x, "[[:punct:]]", new_pattern)
	34	+
	35	+## res <- str_replace_all(x, "[^[:alnum:]]", new_pattern)
	36	+
	37	+
	38	+## res <- str_trim(res, side ="both")
	39	+
	40	+
	41	+## return(res)
	42	+
	43	+## }
	44	+
	45	+
	46	+## ## this function moves a given row of a table to the desired position
	47	+
	48	+## move_row <- function(df, ini_pos, fin_pos){
	49	+
	50	+## ll <- nrow(df)
	51	+
	52	+## row_pick <- slice(df, ini_pos)
	53	+
	54	+## if (fin_pos=="last"){
	55	+
	56	+## res <- df %>%
	57	+## slice(-ini_pos) %>%
	58	+## add_row(row_pick, .before = ll)
	59	+
	60	+
	61	+## } else{
	62	+
	63	+## res <- df %>%
	64	+## slice(-ini_pos) %>%
	65	+## add_row(row_pick, .before = fin_pos)
	66	+## }
	67	+
	68	+## return(res)
	69	+## }
	70	+
	71	+
	72	+## ## this function adds a total row to a table
	73	+
	74	+## add_total <- function(x, pos=1, ...){
	75	+## adorn_totals(x, ...) %>% as_tibble %>%
	76	+## move_row(nrow(.), pos)
	77	+## }
60	78
61	79
62	80
63	81
64		-## a function to remove duplicated columns (see http://bit.ly/2s0q6cC )
65		-
66		-rem_dupl_cols <- function(df){
	82	+## ## a function to remove duplicated columns (see http://bit.ly/2s0q6cC )
67	83
68		-res <- df[, !duplicated(t(df))]
	84	+## rem_dupl_cols <- function(df){
69	85
70		-return(res)
	86	+## res <- df[, !duplicated(t(df))]
71	87
72		-}
	88	+## return(res)
	89	+
	90	+## }
73	91
74	92
75		-## a function to remove constant columns (see http://bit.ly/2s1qqaY
76		-## and http://bit.ly/2s1Sb3q . The solution in the second link is better
77		-## because it works also with factors and characters )
	93	+## ## a function to remove constant columns (see http://bit.ly/2s1qqaY
	94	+## ## and http://bit.ly/2s1Sb3q . The solution in the second link is better
	95	+## ## because it works also with factors and characters )
78	96
79		-rem_const_cols <- function(df){
80		-## res <- df[,apply(df, 2, var, na.rm=TRUE) != 0]
	97	+## rem_const_cols <- function(df){
	98	+## ## res <- df[,apply(df, 2, var, na.rm=TRUE) != 0]
81	99
82	100
83		-## res <- df[sapply(df, function(x) length(unique(na.omit(x)))) > 1]
	101	+## ## res <- df[sapply(df, function(x) length(unique(na.omit(x)))) > 1]
84	102
85		- res <- df %>%
86		- select(where(~length(unique(na.omit(.x))) > 1))
	103	+## res <- df %>%
	104	+## select(where(~length(unique(na.omit(.x))) > 1))
87	105
88	106
89	107
90		-return(res)
	108	+## return(res)
91	109
92		-}
93		-
	110	+## }
94	111
95	112
96	113
97	114
98		-clean_data <- function(x){
	115	+## ## this function makes the column names machine-friendly and removes
	116	+## ## duplicated rows, duplicated columns and constant rows
99	117
100		- res <- x %>%
101		- clean_names() %>%
102		- remove_empty() %>%
103		- distinct() %>%
104		- rem_dupl_cols() %>%
105		- rem_const_cols()
	118	+## clean_data <- function(x){
106	119
107		- return(res)
	120	+## res <- x %>%
	121	+## clean_names() %>%
	122	+## remove_empty() %>%
	123	+## distinct() %>%
	124	+## rem_dupl_cols() %>%
	125	+## rem_const_cols()
	126	+
	127	+## return(res)
108	128
109	129
110		-}
	130	+## }
111	131
112	132
113	133
114		-na_to_pattern <- function(df, pattern){
	134	+## ## this function changes all the missing data in a table to the desired pattern
115	135
116		-res <- df %>% replace(., is.na(.), pattern)
	136	+## na_to_pattern <- function(df, pattern){
117	137
118		-return(res)
	138	+## res <- df %>% replace(., is.na(.), pattern)
	139	+
	140	+## return(res)
119	141
120		-}
	142	+## }
121	143
122	144
123	145
124		-save_excel <- function(output, fileName, sheetName="data", na_yes = TRUE,...){
125		-tryCatch({
	146	+## ### This function is used to save the output of the calculations as an excel file
126	147
127		- wb <- loadWorkbook(fileName)
128		- addWorksheet(wb = wb, sheet = sheetName)
129		- writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F,
130		- keepNA=na_yes ,...)
131		- saveWorkbook(wb = wb, file = fileName, overwrite = T)
132		-},
133		-error = function(err){
	148	+## save_excel <- function(output, fileName, sheetName="data", na_yes = TRUE,...){
	149	+## tryCatch({
134	150
135		- wb <- createWorkbook(fileName)
136		- addWorksheet(wb = wb, sheet = sheetName)
137		- writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F, keepNA=na_yes , ... )
138		- saveWorkbook(wb = wb, file = fileName, overwrite = T)
139		-})
140		-}
	151	+## wb <- loadWorkbook(fileName)
	152	+## addWorksheet(wb = wb, sheet = sheetName)
	153	+## writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F,
	154	+## keepNA=na_yes ,...)
	155	+## saveWorkbook(wb = wb, file = fileName, overwrite = T)
	156	+## },
	157	+## error = function(err){
	158	+
	159	+## wb <- createWorkbook(fileName)
	160	+## addWorksheet(wb = wb, sheet = sheetName)
	161	+## writeData(wb = wb, sheet = sheetName, x = output, colNames = T, rowNames = F, keepNA=na_yes , ... )
	162	+## saveWorkbook(wb = wb, file = fileName, overwrite = T)
	163	+## })
	164	+## }
141	165
142	166
143	167

		@@ -149,6 +173,10 @@
149	173
150	174
151	175
	176	+## tf_list and tf_list2 are two lists of the same temporary framework
	177	+## chapters. They are written in a different way because they need to be
	178	+## looked for (grepped) in different places
	179	+
152	180	tf_list <- c("TF1:", "TF2:", "TF3:", ## "TF4",
153	181	## "TF5",
154	182	"TF6:", "TF7:", "TF8:",

		@@ -161,33 +189,37 @@
161	189	"TF9", "TF10", "TF11", "TF12")
162	190
163	191
	192	+
	193	+### import the raw data about the amendments
164	194	df_am_ini <- read_csv("amendments.csv")
165	195
166		-
	196	+## import the raw data about the decisions
167	197	df_dec_ini <- read_csv("decisions.csv", , locale = readr::locale(encoding = "latin1") )
168	198
169	199
170		- ## read.csv("decisions.csv", fileEncoding="latin1")
171		-
172	200
173	201	df_am <- df_am_ini %>%
174		- clean_data()
	202	+ clean_data() ### clean the amendments data
175	203
176	204	df_dec <- df_dec_ini %>%
177		- clean_data()
	205	+ clean_data() ### clean the decision data
178	206
179		-rep1_ini <- read_csv("report1_detailed.csv")
	207	+rep1_ini <- read_csv("report1_detailed.csv") ## import the raw data from the
	208	+## full report
180	209
181	210
182	211
183	212	rep1 <- rep1_ini %>%
184		- clean_data()
	213	+ clean_data() ## clean the data of the full report
185	214
186	215	rep1_unique <- rep1 %>%
187		- distinct(case_reference, .keep_all=T)
	216	+ distinct(case_reference, .keep_all=T) ## generates the equivalent of
	217	+## report 1 no duplicates in the excel tracker. This is a subset of the full report without duplicated case references.
188	218
189	219
190	220
	221	+### budget calculation. The report without duplicated is broken down (grouped_by) member states (MS) and for each MS we calculate the sum of the confirmed budget. For each MS we also calculate the share of the total budget.
	222	+## A column with the totals is also added.
191	223
192	224	budget_test <- rep1_unique %>%
193	225	group_by(member_state_2_letter_code) %>%

		@@ -195,151 +227,16 @@
195	227	ungroup %>%
196	228	mutate(share=budget/sum(budget)) %>%
197	229	add_total(nrow(.)+1)
198		-
199		-
200		-
201		-
202		-decisions_test <- rep1_unique %>%
203		- filter(!is.na(legal_basis))%>%
204		- group_by(member_state_2_letter_code) %>%
205		- summarise(n_decisions=n()) %>%
206		- ungroup() %>%
207		- arrange(member_state_2_letter_code)%>%
208		- add_total(nrow(.)+1)
209		-
210		-
211		-
212		-count_am <- df_am %>%
213		- group_by(member_state_of_amendment) %>%
214		- summarise(n_amendments=length(unique(amendment_case_number))) %>%
215		- ungroup%>%
216		- add_total(nrow(.)+1)
217		-
218		-
219		-
220		-duration_calc <- rep1 %>%
221		- filter(register_code %in% c("N", "PN"),
222		- decided=="YES") %>%
223		- select(case_reference, member_state_2_letter_code, average_duration,
224		- register_code, legal_basis, type_of_measure )%>%
225		- pivot_wider(names_from="register_code", values_from="average_duration") %>%
226		- ## na_to_pattern(0) %>%
227		- mutate(type=if_else(is.na(PN), "N", "P+N")) %>%
228		- rowwise() %>%
229		- mutate(total_duration=sum(N,PN, na.rm=T)) %>%
230		- ungroup
231		-
232		-
233		-## save_excel(duration_calc, "duration_raw_data.xlsx")
234		-
235		-
236		-
237		-
238		-duration_stat_ms <- duration_calc %>%
239		- group_by(member_state_2_letter_code) %>%
240		- summarise(mean_duration=mean(total_duration, na.rm=T),
241		- median_duration=median(total_duration, na.rm=T),
242		- mean_PN_duration=mean(PN, na.rm=T),
243		- median_PN_duration=median(PN, na.rm=T),
244		- mean_N_duration=mean(N, na.rm=T),
245		- median_N_duration=median(N, na.rm=T),
246		- number_data_points_N=n()-sum(is.na(N)),
247		- number_data_points_PN=n()-sum(is.na(PN))
248		- )%>%
249		- ungroup
250		-
251		-## save_excel(duration_stat_ms, "duration_by_MS.xlsx")
252		-
253	230
254	231
255		-duration_stat_eu <- duration_calc %>%
256		- ## group_by(legal_basis) %>%
257		- summarise(mean_duration=mean(total_duration, na.rm=T),
258		- median_duration=median(total_duration, na.rm=T),
259		- mean_PN_duration=mean(PN, na.rm=T),
260		- median_PN_duration=median(PN, na.rm=T),
261		- mean_N_duration=mean(N, na.rm=T),
262		- median_N_duration=median(N, na.rm=T),
263		- number_data_points_N=n()-sum(is.na(N)),
264		- number_data_points_PN=n()-sum(is.na(PN))) %>%
265		- ungroup
266		-
267		-## save_excel(duration_stat_eu, "duration_all_eu.xlsx")
268		-
269		-duration_stat_basis <- duration_calc %>%
270		- group_by(legal_basis) %>%
271		- summarise(mean_duration=mean(total_duration, na.rm=T),
272		- median_duration=median(total_duration, na.rm=T),
273		- mean_PN_duration=mean(PN, na.rm=T),
274		- median_PN_duration=median(PN, na.rm=T),
275		- mean_N_duration=mean(N, na.rm=T),
276		- median_N_duration=median(N, na.rm=T),
277		- number_data_points_N=n()-sum(is.na(N)),
278		- number_data_points_PN=n()-sum(is.na(PN)))%>%
279		- ungroup
280		-
281		-## save_excel(duration_stat_basis, "duration_by_legal_basis.xlsx")
282		-
283		-
284		-
285		-
286		-duration_stat_basis_ms <- duration_calc %>%
287		- group_by(member_state_2_letter_code,legal_basis) %>%
288		- summarise(mean_duration=mean(total_duration, na.rm=T),
289		- median_duration=median(total_duration, na.rm=T),
290		- mean_PN_duration=mean(PN, na.rm=T),
291		- median_PN_duration=median(PN, na.rm=T),
292		- mean_N_duration=mean(N, na.rm=T),
293		- median_N_duration=median(N, na.rm=T),
294		- number_data_points_N=n()-sum(is.na(N)),
295		- number_data_points_PN=n()-sum(is.na(PN)))%>%
296		- ungroup %>%
297		- arrange(member_state_2_letter_code)
298		-
299		-
300		-
301		-
302		-
303		-
304		-
305		-duration_stat_measure <- duration_calc %>%
306		- group_by(type_of_measure) %>%
307		- summarise(mean_duration=mean(total_duration, na.rm=T),
308		- median_duration=median(total_duration, na.rm=T),
309		- mean_PN_duration=mean(PN, na.rm=T),
310		- median_PN_duration=median(PN, na.rm=T),
311		- mean_N_duration=mean(N, na.rm=T),
312		- median_N_duration=median(N, na.rm=T),
313		- number_data_points_N=n()-sum(is.na(N)),
314		- number_data_points_PN=n()-sum(is.na(PN)))%>%
315		- ungroup
316		-
317		-
318		-
319		-
320		-duration_stat_measure_ms <- duration_calc %>%
321		- group_by(member_state_2_letter_code,type_of_measure) %>%
322		- summarise(mean_duration=mean(total_duration, na.rm=T),
323		- median_duration=median(total_duration, na.rm=T),
324		- mean_PN_duration=mean(PN, na.rm=T),
325		- median_PN_duration=median(PN, na.rm=T),
326		- mean_N_duration=mean(N, na.rm=T),
327		- median_N_duration=median(N, na.rm=T),
328		- number_data_points_N=n()-sum(is.na(N)),
329		- number_data_points_PN=n()-sum(is.na(PN)))%>%
330		- ungroup %>%
331		- arrange(member_state_2_letter_code)
332		-
333		-
334		-
335		-
336		-## save_excel(duration_stat_measure, "duration_by_legal_measure.xlsx")
337	232
338	233
339	234	aid_type <- tibble(code=c("AH", "IA", "S"),
340	235	name=c("Ad hoc", "Individual application within a scheme",
341	236	"Schemes"))
342	237
	238	+### budget broken by type
	239	+
343	240	budget_type <- rep1_unique %>%
344	241	group_by(## member_state_2_letter_code,
345	242	case_type_code) %>%

		@@ -352,6 +249,7 @@
352	249	add_total(nrow(.)+1)
353	250
354	251
	252	+### budget broken by type and MS
355	253
356	254	budget_type_ms <- rep1_unique %>%
357	255	group_by( member_state_2_letter_code,

		@@ -370,7 +268,7 @@
370	268
371	269
372	270
373		-
	271	+#### budget broken by measure
374	272
375	273
376	274	budget_measure <- rep1_unique %>%

		@@ -384,7 +282,7 @@
384	282
385	283
386	284
387		-
	285	+#### budget broken by measure and MS
388	286
389	287	budget_measure_ms <- rep1_unique %>%
390	288	group_by( member_state_2_letter_code,

		@@ -405,9 +303,107 @@
405	303
406	304
407	305
	306	+### budget broken by beneficiary sector
408	307
409	308
410	309
	310	+budget_by_beneficiary_sector <- rep1_unique %>%
	311	+ group_by(beneficary_sector) %>%
	312	+ summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
	313	+ ungroup %>%
	314	+ filter(complete.cases(.)) %>%
	315	+ mutate(share=budget/sum(budget)) %>%
	316	+ arrange(desc(budget))%>%
	317	+ add_total(nrow(.)+1)
	318	+
	319	+
	320	+### budget broken by MS and beneficiary sector
	321	+
	322	+budget_by_beneficiary_sector_ms <- rep1_unique %>%
	323	+ group_by(member_state_2_letter_code,beneficary_sector) %>%
	324	+ summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
	325	+ na.omit %>%
	326	+ mutate(share=budget/sum(budget)) %>%
	327	+ group_modify(~ .x %>%
	328	+ adorn_totals("row")) %>%
	329	+ ungroup%>%
	330	+ complete(member_state_2_letter_code,beneficary_sector) %>%
	331	+ na_to_pattern(0)
	332	+
	333	+
	334	+
	335	+
	336	+### budget broken by beneficiary type
	337	+
	338	+
	339	+budget_by_beneficiary_type <- rep1_unique %>%
	340	+ group_by(beneficary_type) %>%
	341	+ summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
	342	+ ungroup %>%
	343	+ filter(complete.cases(.)) %>%
	344	+ mutate(share=budget/sum(budget)) %>%
	345	+ arrange(desc(budget))%>%
	346	+ add_total(nrow(.)+1)
	347	+
	348	+
	349	+
	350	+### budget broken by beneficiary type and MS
	351	+
	352	+
	353	+budget_by_beneficiary_type_ms <- rep1_unique %>%
	354	+ group_by(member_state_2_letter_code,beneficary_type) %>%
	355	+ summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
	356	+ na.omit %>%
	357	+ mutate(share=budget/sum(budget)) %>%
	358	+ group_modify(~ .x %>%
	359	+ adorn_totals("row")) %>%
	360	+ ungroup%>%
	361	+ complete(member_state_2_letter_code,beneficary_type) %>%
	362	+ na_to_pattern(0)
	363	+
	364	+
	365	+
	366	+
	367	+
	368	+
	369	+
	370	+
	371	+
	372	+
	373	+
	374	+
	375	+
	376	+
	377	+
	378	+
	379	+
	380	+
	381	+
	382	+
	383	+
	384	+
	385	+
	386	+
	387	+
	388	+
	389	+
	390	+### statistics on the number of decisions.
	391	+## from the report file without duplicates, we remove those rows for which
	392	+## the legal basis is missing. We break down the remaining data by MS and
	393	+## we count the number of decisions for each MS (just the number of lines for each MS) and we add a total row
	394	+
	395	+decisions_test <- rep1_unique %>%
	396	+ filter(!is.na(legal_basis))%>%
	397	+ group_by(member_state_2_letter_code) %>%
	398	+ summarise(n_decisions=n()) %>%
	399	+ ungroup() %>%
	400	+ arrange(member_state_2_letter_code)%>%
	401	+ add_total(nrow(.)+1)
	402	+
	403	+
	404	+
	405	+### number of decisions by legal_basis
	406	+
411	407	decisions_eu <- rep1_unique %>%
412	408	filter(!is.na(legal_basis))%>%
413	409	group_by(legal_basis) %>%

		@@ -418,6 +414,8 @@
418	414	add_total(nrow(.)+1)
419	415
420	416
	417	+### number of decisions by legal_basis and MS
	418	+
421	419
422	420	decisions_eu_ms <- rep1_unique %>%
423	421	filter(!is.na(legal_basis))%>%

		@@ -436,6 +434,300 @@
436	434
437	435
438	436
	437	+
	438	+## statistics on the number of amendements. We use the first two columns of
	439	+## the amendments file. We break the amendment data by MS and we count the number of amendments for each MS as the number of unique amendment case numbers (in case some of them are repeated)
	440	+
	441	+
	442	+count_am <- df_am %>%
	443	+ group_by(member_state_of_amendment) %>%
	444	+ summarise(n_amendments=length(unique(amendment_case_number))) %>%
	445	+ ungroup%>%
	446	+ add_total(nrow(.)+1)
	447	+
	448	+
	449	+### this the fundamental table for the duration calculations
	450	+## we consider the full report and we filter the register code to be
	451	+## either N of PN and the case needs to have been decided (i.e. decided = YES)
	452	+## we select the columns case_reference, member_state_2_letter_code,
	453	+## average_duration, register_code, legal_basis, type_of_measure
	454	+## and we pivot the table.
	455	+
	456	+## Finally we obtain a table with the structure
	457	+
	458	+## case_reference member_state_2_… legal_basis type_of_measure N PN type total_duration
	459	+
	460	+## in which we have the duration of the notification and prenotification phase of all the decided cases
	461	+
	462	+duration_calc <- rep1 %>%
	463	+ filter(register_code %in% c("N", "PN"),
	464	+ decided=="YES") %>%
	465	+ select(case_reference, member_state_2_letter_code, average_duration,
	466	+ register_code, legal_basis, type_of_measure )%>%
	467	+ pivot_wider(names_from="register_code", values_from="average_duration") %>%
	468	+ ## na_to_pattern(0) %>%
	469	+ mutate(type=if_else(is.na(PN), "N", "P+N")) %>%
	470	+ rowwise() %>%
	471	+ mutate(total_duration=sum(N,PN, na.rm=T)) %>%
	472	+ ungroup
	473	+
	474	+
	475	+## save_excel(duration_calc, "duration_decisions_raw_data.xlsx")
	476	+
	477	+
	478	+
	479	+
	480	+
	481	+
	482	+
	483	+
	484	+## we break the duration_calc data by MS and we calculate statistics
	485	+## on the duration of the N, PN and combined N+PN process.
	486	+
	487	+duration_stat_ms <- duration_calc %>%
	488	+ group_by(member_state_2_letter_code) %>%
	489	+ summarise(mean_duration=mean(total_duration, na.rm=T),
	490	+ median_duration=median(total_duration, na.rm=T),
	491	+ mean_PN_duration=mean(PN, na.rm=T),
	492	+ median_PN_duration=median(PN, na.rm=T),
	493	+ mean_N_duration=mean(N, na.rm=T),
	494	+ median_N_duration=median(N, na.rm=T),
	495	+ number_data_points_N=n()-sum(is.na(N)),
	496	+ number_data_points_PN=n()-sum(is.na(PN))
	497	+ )%>%
	498	+ ungroup
	499	+
	500	+## save_excel(duration_stat_ms, "duration_by_MS.xlsx")
	501	+
	502	+
	503	+### As above, but without breaking the data by MS ---> we calculate the stats for the whole EU.
	504	+
	505	+duration_stat_eu <- duration_calc %>%
	506	+ ## group_by(legal_basis) %>%
	507	+ summarise(mean_duration=mean(total_duration, na.rm=T),
	508	+ median_duration=median(total_duration, na.rm=T),
	509	+ mean_PN_duration=mean(PN, na.rm=T),
	510	+ median_PN_duration=median(PN, na.rm=T),
	511	+ mean_N_duration=mean(N, na.rm=T),
	512	+ median_N_duration=median(N, na.rm=T),
	513	+ number_data_points_N=n()-sum(is.na(N)),
	514	+ number_data_points_PN=n()-sum(is.na(PN))) %>%
	515	+ ungroup
	516	+
	517	+## save_excel(duration_stat_eu, "duration_all_eu.xlsx")
	518	+
	519	+
	520	+#### Now the duration statistics is calculated by breaking the data according
	521	+## to the legal basis
	522	+
	523	+duration_stat_basis <- duration_calc %>%
	524	+ group_by(legal_basis) %>%
	525	+ summarise(mean_duration=mean(total_duration, na.rm=T),
	526	+ median_duration=median(total_duration, na.rm=T),
	527	+ mean_PN_duration=mean(PN, na.rm=T),
	528	+ median_PN_duration=median(PN, na.rm=T),
	529	+ mean_N_duration=mean(N, na.rm=T),
	530	+ median_N_duration=median(N, na.rm=T),
	531	+ number_data_points_N=n()-sum(is.na(N)),
	532	+ number_data_points_PN=n()-sum(is.na(PN)))%>%
	533	+ ungroup
	534	+
	535	+## save_excel(duration_stat_basis, "duration_by_legal_basis.xlsx")
	536	+
	537	+
	538	+## Now the data is broken both by legal basis and by duration statistics.
	539	+
	540	+duration_stat_basis_ms <- duration_calc %>%
	541	+ group_by(member_state_2_letter_code,legal_basis) %>%
	542	+ summarise(mean_duration=mean(total_duration, na.rm=T),
	543	+ median_duration=median(total_duration, na.rm=T),
	544	+ mean_PN_duration=mean(PN, na.rm=T),
	545	+ median_PN_duration=median(PN, na.rm=T),
	546	+ mean_N_duration=mean(N, na.rm=T),
	547	+ median_N_duration=median(N, na.rm=T),
	548	+ number_data_points_N=n()-sum(is.na(N)),
	549	+ number_data_points_PN=n()-sum(is.na(PN)))%>%
	550	+ ungroup %>%
	551	+ arrange(member_state_2_letter_code)
	552	+
	553	+
	554	+
	555	+
	556	+
	557	+#### now the duration statistics is broken by type of measure.
	558	+
	559	+duration_stat_measure <- duration_calc %>%
	560	+ group_by(type_of_measure) %>%
	561	+ summarise(mean_duration=mean(total_duration, na.rm=T),
	562	+ median_duration=median(total_duration, na.rm=T),
	563	+ mean_PN_duration=mean(PN, na.rm=T),
	564	+ median_PN_duration=median(PN, na.rm=T),
	565	+ mean_N_duration=mean(N, na.rm=T),
	566	+ median_N_duration=median(N, na.rm=T),
	567	+ number_data_points_N=n()-sum(is.na(N)),
	568	+ number_data_points_PN=n()-sum(is.na(PN)))%>%
	569	+ ungroup
	570	+
	571	+
	572	+### and now it is also further broken by MS.
	573	+
	574	+duration_stat_measure_ms <- duration_calc %>%
	575	+ group_by(member_state_2_letter_code,type_of_measure) %>%
	576	+ summarise(mean_duration=mean(total_duration, na.rm=T),
	577	+ median_duration=median(total_duration, na.rm=T),
	578	+ mean_PN_duration=mean(PN, na.rm=T),
	579	+ median_PN_duration=median(PN, na.rm=T),
	580	+ mean_N_duration=mean(N, na.rm=T),
	581	+ median_N_duration=median(N, na.rm=T),
	582	+ number_data_points_N=n()-sum(is.na(N)),
	583	+ number_data_points_PN=n()-sum(is.na(PN)))%>%
	584	+ ungroup %>%
	585	+ arrange(member_state_2_letter_code)
	586	+
	587	+
	588	+
	589	+
	590	+
	591	+
	592	+
	593	+
	594	+
	595	+
	596	+
	597	+
	598	+
	599	+
	600	+
	601	+
	602	+
	603	+
	604	+
	605	+
	606	+#### I keep only some columns of the duration calc table and I rename them
	607	+duration_calc_restricted <- duration_calc %>%
	608	+ select(case_reference,member_state_2_letter_code, N, PN) %>%
	609	+ na_to_pattern(0) %>%
	610	+ mutate(total_duration=N+PN) %>%
	611	+ select(-c(N, PN)) %>%
	612	+ rename("MS"="member_state_2_letter_code") %>%
	613	+ mutate(type="decision")
	614	+
	615	+
	616	+
	617	+
	618	+###file with the raw data on the duration of the amendments.
	619	+### it is important to know that when a PN notification has a duration of zero,
	620	+## it actually means that it had never taken place and so the zeros should be
	621	+## treated as missing data.
	622	+
	623	+## I also keep only the distinct amendment numbers to avoid double-counting
	624	+## the amendments.
	625	+
	626	+duration_calc_am <- df_am %>%
	627	+ distinct(amendment_case_number, .keep_all=T) %>%
	628	+ select(case_number_key , amendment_case_number,member_state_of_amendment,
	629	+ duration_of_n_phase_calendar_days, duration_of_pn_phase_calendar_days,total_duration_pn_n_calendar_days
	630	+ ) %>%
	631	+ pattern_to_na(0) %>%
	632	+ mutate(is_PN_plus_N=if_else(!is.na(duration_of_pn_phase_calendar_days),
	633	+ "yes", "no"))
	634	+
	635	+
	636	+### I now select only some columns of the duration_calc table
	637	+duration_calc_am_restricted <- duration_calc_am %>%
	638	+ select(case_number_key,member_state_of_amendment,total_duration_pn_n_calendar_days ) %>%
	639	+ rename("case_reference"="case_number_key",
	640	+ "MS"="member_state_of_amendment",
	641	+ "total_duration"="total_duration_pn_n_calendar_days") %>%
	642	+ mutate(type="amendment")
	643	+
	644	+
	645	+
	646	+#### Here I create a table where for every case I list the time to take a decision and the duration of each amendment process.
	647	+
	648	+duration_am_plus_decisions <- bind_rows(duration_calc_am_restricted,
	649	+ duration_calc_restricted) %>%
	650	+ arrange(MS, case_reference)
	651	+
	652	+save_excel(duration_am_plus_decisions, "duration_decisions_amendments_raw_data.xlsx")
	653	+
	654	+
	655	+## I calculate some statistics by MS on the collected data on durations and decisions.
	656	+
	657	+duration_am_plus_decisions_ms <- duration_am_plus_decisions %>%
	658	+ group_by(MS) %>%
	659	+ summarise(mean_duration=mean(total_duration),
	660	+ median_duration=median(total_duration)) %>%
	661	+ ungroup
	662	+
	663	+
	664	+## and now the same stats at the EU level.
	665	+duration_am_plus_decisions_eu <- duration_am_plus_decisions %>%
	666	+ ## group_by(MS) %>%
	667	+ summarise(mean_duration=mean(total_duration),
	668	+ median_duration=median(total_duration)) ## %>%
	669	+ ## ungroup
	670	+
	671	+
	672	+
	673	+### I now calculate the statistics on the duration of the amendments process
	674	+## at the MS level
	675	+
	676	+duration_stat_am_ms <- duration_calc_am %>%
	677	+ group_by(member_state_of_amendment) %>%
	678	+ summarise(mean_duration=mean(total_duration_pn_n_calendar_days, na.rm=T),
	679	+ median_duration=median(total_duration_pn_n_calendar_days, na.rm=T),
	680	+ mean_PN_duration=mean(duration_of_pn_phase_calendar_days ,
	681	+ na.rm=T),
	682	+ median_PN_duration=median(duration_of_pn_phase_calendar_days ,
	683	+ na.rm=T),
	684	+ mean_N_duration=mean(duration_of_n_phase_calendar_days, na.rm=T),
	685	+ median_N_duration=median(duration_of_n_phase_calendar_days, na.rm=T),
	686	+ number_data_points_N=n()-sum(is.na(duration_of_n_phase_calendar_days)),
	687	+ number_data_points_PN=n()-sum(is.na(duration_of_pn_phase_calendar_days))
	688	+ )%>%
	689	+ ungroup
	690	+
	691	+
	692	+
	693	+
	694	+## and now the same stats at the EU level.
	695	+
	696	+duration_stat_am_eu <- duration_calc_am %>%
	697	+ ## group_by(member_state_of_amendment) %>%
	698	+ summarise(mean_duration=mean(total_duration_pn_n_calendar_days, na.rm=T),
	699	+ median_duration=median(total_duration_pn_n_calendar_days, na.rm=T),
	700	+ mean_PN_duration=mean(duration_of_pn_phase_calendar_days ,
	701	+ na.rm=T),
	702	+ median_PN_duration=median(duration_of_pn_phase_calendar_days ,
	703	+ na.rm=T),
	704	+ mean_N_duration=mean(duration_of_n_phase_calendar_days, na.rm=T),
	705	+ median_N_duration=median(duration_of_n_phase_calendar_days, na.rm=T),
	706	+ number_data_points_N=n()-sum(is.na(duration_of_n_phase_calendar_days)),
	707	+ number_data_points_PN=n()-sum(is.na(duration_of_pn_phase_calendar_days))
	708	+ )## %>%
	709	+ ## ungroup
	710	+
	711	+
	712	+
	713	+
	714	+
	715	+
	716	+
	717	+
	718	+## save_excel(duration_stat_measure, "duration_by_legal_measure.xlsx")
	719	+
	720	+
	721	+
	722	+
	723	+
	724	+
	725	+
	726	+
	727	+
	728	+
	729	+### This time we just count the number of measures by legal basis
	730	+
439	731	measures_covid <- rep1_unique %>%
440	732	filter(decided=="YES", !is.na(legal_basis))%>%
441	733	group_by(legal_basis) %>%

		@@ -445,7 +737,7 @@
445	737	mutate(share=n_measures/sum(n_measures))%>%
446	738	add_total(nrow(.)+1)
447	739
448		-
	740	+### now number of measures by legal basis and MS
449	741
450	742	measures_covid_ms <- rep1_unique %>%
451	743	filter(decided=="YES", !is.na(legal_basis))%>%

		@@ -464,6 +756,7 @@
464	756
465	757
466	758
	759	+### Statistics on SA cases under assessment
467	760
468	761	sa_n_pn <- rep1_unique %>%
469	762	## filter(decided=="YES", !is.na(legal_basis),

		@@ -476,7 +769,7 @@
476	769	add_total(nrow(.)+1)
477	770
478	771
479		-
	772	+### as above, but also broken by MS
480	773	sa_n_pn_ms <- rep1_unique %>%
481	774	## filter(decided=="YES", !is.na(legal_basis),
482	775	## register_code %in% c("N", "PN"))%>%

		@@ -494,7 +787,7 @@
494	787
495	788
496	789
497		-
	790	+### and this time broken by Unit dealing with them
498	791	sa_n_pn_unit <- rep1_unique %>%
499	792	## filter(decided=="YES", !is.na(legal_basis),
500	793	## register_code %in% c("N", "PN"))%>%

		@@ -509,6 +802,7 @@
509	802
510	803
511	804
	805	+### now broken by both unit and MS
512	806
513	807	sa_n_pn_unit_ms <- rep1_unique %>%
514	808	## filter(decided=="YES", !is.na(legal_basis),

		@@ -529,65 +823,9 @@
529	823
530	824
531	825
532		-
533		-
534		-budget_by_beneficiary_sector <- rep1_unique %>%
535		- group_by(beneficary_sector) %>%
536		- summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
537		- ungroup %>%
538		- filter(complete.cases(.)) %>%
539		- mutate(share=budget/sum(budget)) %>%
540		- arrange(desc(budget))%>%
541		- add_total(nrow(.)+1)
542		-
543		-
544		-
545		-
546		-budget_by_beneficiary_sector_ms <- rep1_unique %>%
547		- group_by(member_state_2_letter_code,beneficary_sector) %>%
548		- summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
549		- na.omit %>%
550		- mutate(share=budget/sum(budget)) %>%
551		- group_modify(~ .x %>%
552		- adorn_totals("row")) %>%
553		- ungroup%>%
554		- complete(member_state_2_letter_code,beneficary_sector) %>%
555		- na_to_pattern(0)
556		-
557		-
558		-
559		-
560		-
561		-
562		-
563		-budget_by_beneficiary_type <- rep1_unique %>%
564		- group_by(beneficary_type) %>%
565		- summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
566		- ungroup %>%
567		- filter(complete.cases(.)) %>%
568		- mutate(share=budget/sum(budget)) %>%
569		- arrange(desc(budget))%>%
570		- add_total(nrow(.)+1)
571		-
572		-
573		-
574		-
575		-
576		-budget_by_beneficiary_type_ms <- rep1_unique %>%
577		- group_by(member_state_2_letter_code,beneficary_type) %>%
578		- summarise(budget=sum(confirmed_budgets, na.rm=T)) %>%
579		- na.omit %>%
580		- mutate(share=budget/sum(budget)) %>%
581		- group_modify(~ .x %>%
582		- adorn_totals("row")) %>%
583		- ungroup%>%
584		- complete(member_state_2_letter_code,beneficary_type) %>%
585		- na_to_pattern(0)
586		-
587		-
588		-
589		-
590		-
	826	+### from the rep1_unique table, we filter the decided cases with procedure
	827	+## either N or PN. then we collapse all the text in type_of_measure column and we count how many times we find there the words in tf_list (i.e. the TF chapters)
	828	+## we want to track.
591	829
592	830	chapters_stat_test <- rep1_unique %>%
593	831	filter(decided=="YES",

		@@ -599,18 +837,19 @@
599	837	mutate(share=value/sum(value))%>%
600	838	mutate(TF_chapter=tf_list)
601	839
602		-## test1 <- rep1_unique %>%
603		-## filter(decided=="YES",
604		-## register_code %in% c("N", "PN"))%>%
605		-## filter(grepl("TF3",type_of_measure) )
606	840
607		-## test1bis <- rep1_unique %>%
608		-## filter(case_reference=="SA.57574")
	841	+### The calculation above is correct but it does not allow one to see exactly
	842	+## to which TF chapter each case is associated to.
	843	+
609	844
610	845	tf <- rep1_unique %>%
611	846	filter(decided=="YES",
612		- register_code %in% c("N", "PN"))
613		-
	847	+ register_code %in% c("N", "PN")) ### I again take the decided
	848	+ ### case either N or PN as register_code
	849	+
	850	+### here I use map_df to iterate the filtering of the tf dataframe according
	851	+## to the tf_list of chapters
	852	+
614	853	tf_all <- map_df(tf_list, function(x) {filter(tf,grepl(x, type_of_measure)) %>%
615	854	mutate(TF=x) }) %>%
616	855	mutate(TF=remove_special_char(TF)) %>%

		@@ -623,6 +862,11 @@
623	862	## filter(grepl("TF1", type_of_measure))
624	863
625	864
	865	+### I need to treat separately the cases associated to more than 3 chapters
	866	+## because they are stored somewhere else. These calculations are along
	867	+## the lines of chapter_stat_test, but I need tf_list2 because the TF chapters
	868	+## are written a bit differently.
	869	+
626	870	chapters_stat_extra <- df_dec %>%
627	871	pull(specify_tf_others_more_than_three_sections_combined) %>%
628	872	paste0(collapse = " ") %>%

		@@ -631,6 +875,7 @@
631	875	mutate(share=value/sum(value))%>%
632	876	mutate(TF_chapter=tf_list)
633	877
	878	+### and once again, if I want to have the association between case and TF chapter, I run calculations similar to those for tf_all
634	879
635	880	tf_all2 <- map_df(tf_list2, function(x) {filter(df_dec,
636	881	grepl(x, specify_tf_others_more_than_three_sections_combined)) %>%

		@@ -640,14 +885,17 @@
640	885	rename("description"="specify_tf_others_more_than_three_sections_combined")
641	886
642	887
	888	+### and now I have the detailed situation for all cases and chapters
	889	+
643	890	all_chapter_cases <- bind_rows(tf_all, tf_all2)
644	891
645		-## save_excel(all_chapter_cases, "chapters_detailed.xlsx")
646		-
647	892
648	893
649	894
650	895
	896	+#### If I just need the statistics on all the TF chapters, I can add the count
	897	+## data on chapter usage from chapter_stat_test and chapter_stat_extra
	898	+
651	899
652	900	chapters_stat_fin <- chapters_stat_test %>%
653	901	select(TF_chapter, value) %>%

		@@ -657,6 +905,8 @@
657	905	mutate(TF_chapter=remove_special_char(TF_chapter))
658	906
659	907
	908	+#### I can finally use the detailed statistics in all_chapter cases
	909	+### to count the number of TF chapter per MS.
660	910
661	911	chapters_stat_fin_ms <- all_chapter_cases %>%
662	912	tabyl(member_state_code, TF) %>%

myprojects-hg-reborn Fork

コミット

タグ

よく使われているワード(クリックで追加)

コミットメタ情報

ログメッセージ

変更サマリ

差分

myprojects-hg-reborn
Fork