*****************************************************************************;
*************************** I by J Table Macro ***************************;
*****************************************************************************;
/*
notes/documentation
Inputs:
indata:Name of input dataset.
outdata:Name of output dataset(contains more than the automatic printed summary).
var1:First variable(character or numeric) for an i by j contingency table analysis.
var2Second variable(character or numeric) for an i by j contingency table analysis.
print:Set to "Y" by default, specifying anything else will suppress the printed summary.
exact:Set to "N" by default, must specify "Y" to add the Fisher Exact test to the output.
"Y" and "N" are the only allowable inputs, other values will produce an error.
Exact test is not advised with "very large" datasets, requires too much computing.
*/
/***
example calls:
%include 'C:\Documents and Settings\ThostensonJeffD\My Documents\sas_macros\macro_ibyj.sas'; run;
Does not produce Fisher's Exact test since default option is N:
%ibyj(indata=testing, outdata=t3, var1=med_code_n, var2=sex_c, print=Y); run;
Produces Fisher's Exact test:
%ibyj(indata=testing, outdata=t3, var1=med_code_n, var2=sex_c, print=Y, exact=Y); run;
Does not produce Fisher's Exact test:
%ibyj(indata=testing, outdata=t3, var1=med_code_n, var2=sex_c, print=Y, exact=N); run;
Produces error, only options for exact are N or Y:
%ibyj(indata=testing, outdata=t3, var1=med_code_n, var2=sex_c, print=Y, exact=n); run;
%ibyj(indata=testing, outdata=t3, var1=med_code_n, var2=sex_c, print=Y, exact=yes); run;
***/
*------*;
*------IBYJ macro --- START------*;
%macro ibyj(indata=, outdata=, var1=, var2=, print=Y, exact=N);
/* Create temporary dataset from input dataset */
data _temp;
set &indata;
run;
/* Close listing for output */
run; ods listing close; run;
*------*;
*Run proc contents to pull format information;
proc contents data=_temp;
ods output Variables=_contents; run;
data _contents(keep= variable format label type);
set _contents;
if variable=compress("&var1",' ') then do;
call symput('format1',format);
call symput('type1',type);
end;
if variable=compress("&var2",' ') then do;
call symput('format2',format);
call symput('type2',type);
end;
run;
**--if var1 or var2 is numeric, change to character--**;
*-case 1: variable 1 is a formatted numeric variable-*;
%if &type1=Num & &format1^= %then %do;
*rename input numeric variable to make way for character-coerced version;
data _temp; set _temp; rename &var1=_temp1;
*create character-coerced version of numeric input variable;
data _temp;
set _temp;
&var1=put(_temp1,&format1);
run;
%end;
*-case 1: variable 1 is a formatted numeric variable-*;
*-case 2: variable 1 is an unformatted numeric variable-*;
%if &type1=Num & &format1= %then %do;
data _temp; set _temp; rename &var1=_temp1;
data _temp;
set _temp;
&var1=compress(_temp1,' ');
run;
%end;
*-case 2: variable 1 is an unformatted numeric variable-*;
*-case 3: variable 2 is a formatted numeric variable-*;
%if &type2=Num & &format2^= %then %do;
data _temp; set _temp; rename &var2=_temp2;
data _temp;
set _temp;
&var2=put(_temp2,&format2);
run;
%end;
*-case 3: variable 2 is a formatted numeric variable-*;
*-case 4: variable 2 is an unformatted numeric variable-*;
%if &type2=Num & &format2= %then %do;
data _temp; set _temp; rename &var2=_temp2;
data _temp;
set _temp;
&var2=compress(_temp2,' ');
run;
%end;
*-case 4: variable 2 is an unformatted numeric variable-*;
**--if var1 or var2 is numeric, change to character--**;
*------*;
*------*;
*Run proc freq to get counts, percents, and statistical test output;
*With exact test;
%if &exact=Y %then %do;
proc freq data=_temp;
where compress(&var1,' ')^='.' & compress(&var2,' ')^='.';
tables &var1*&var2 / chisq fisher;
ods output CrossTabFreqs=_ctf ChiSq=_cs FishersExact=_fe;
%end;
*Without exact test;
%if &exact ne Y %then %do;
proc freq data=_temp;
where compress(&var1,' ')^='.' & compress(&var2,' ')^='.';
tables &var1*&var2 / chisq;
ods output CrossTabFreqs=_ctf ChiSq=_cs;
%end;
*------*;
/* Re-open listing for output */
run; ods listing; run;
*------*;
*-Calculate expected cell counts to check for Chi-square warning-*;
*observed cell counts dataset;
data _observed(keep= table &var1 &var2 observed); set _ctf; if _type_=11; observed=frequency; run;
*ti (variable 1 totals) dataset;
data _ti(keep= table &var1 ti); set _ctf; if _type_=10; ti=frequency; run;
*tj (variable 2 totals) dataset;
data _tj(keep= table &var2 tj); set _ctf; if _type_=01; tj=frequency; run;
*tn (table total) dataset;
data _n(keep= table n); set _ctf; if _type_=00; n=frequency; run;
*merge observed with the ti, tj, and n datasets to calculate expected cell counts;
proc sort data=_observed; by table &var1;
proc sort data=_ti; by table &var1;
data _expected; merge _observed _ti; by table &var1; run;
proc sort data=_expected; by table &var2;
proc sort data=_tj; by table &var2;
data _expected; merge _expected _tj; by table &var2; run;
*calculate expected cell counts;
proc sort data=_expected; by table;
proc sort data=_n; by table;
data _expected;
merge _expected _n;
by table;
expected=(ti*tj)/n;
if expected<5 then flag=1; else flag=0;
dummy=1;
run;
*calculate percentage of cells with expected counts less than 5;
proc sort data=_expected; by table;
data _warning(keep= warning_percent first);
set _expected; by table;
retain flags total;
first=1;
if first.table=1 then do;
flags=0; total=0;
end;
flags=flags+flag;
total=total+dummy;
warning_percent=flags/total;
if last.table;
format warning_percent percent6.1;
run;
*------*;
*------*;
*-Counts & Percents output datasets-*;
data _ctf(drop= table _type_ _table_ percent colpercent missing);
set _ctf;
if _type_=11;
length varname1 $ 15 varname2 $ 15 ;*text1 $ 20 text2 $ 20;
*-names of variables-*;
varname1=compress(scan(table,2,' '),' ');
varname2=compress(scan(table,2,'*'),' ');
text1=&var1;
text2=&var2;
format rowpercent 8.1;
run;
*-Counts & Percents output datasets-*;
*------*;
*------*;
*-Chi-square & Fisher p-value output datasets-*;
data _cs(keep= table cs csdf csp);
set _cs;
if statistic='Chi-Square';
cs=value;
csdf=df;
csp=prob;
format cs 8.2 csp pvalue6.4;
run;
*With Fisher Exact test;
%if &exact=Y %then %do;
data _fe(keep= table fisherp);
set _fe;
if name1='XP2_FISH';
fisherp=nvalue1;
format fisherp pvalue6.4;
run;
proc sort data=_cs; by table;
proc sort data=_fe; by table;
data _csfe(drop= table);
merge _cs _fe;
by table;
length varname1 $ 15 varname2 $ 15;
varname1=compress(scan(table,2,' '),' ');
varname2=compress(scan(table,2,'*'),' ');
run;
%end;
*Without Fisher Exact test;
%if &exact=N %then %do;
data _csfe(drop= table);
set _cs;
length varname1 $ 15 varname2 $ 15;
varname1=compress(scan(table,2,' '),' ');
varname2=compress(scan(table,2,'*'),' ');
fisherp=.;
run;
%end;
*-Chi-square & Fisher p-value output datasets-*;
*------*;
*------*;
*-Merge Counts/Percents & Chi-square/Fisher output datasets-*;
*create dummy 'first' variable;
proc sort data=_ctf; by varname1 varname2 &var1 text2;
proc sort data=_csfe; by varname1 varname2;
data _ctf; set _ctf; by varname1 varname2; if first.varname2 then first=1;
data _csfe; set _csfe; by varname1 varname2; if first.varname2 then first=1;
*merge CTF with warning dataset;
proc sort data=_ctf; by first;
proc sort data=_warning; by first;
data _ctf;
merge _ctf _warning;
by first;
*merge CTF & CSFE;
proc sort data=_ctf; by varname1 varname2 first;
proc sort data=_csfe; by varname1 varname2 first;
data &outdata(drop= &var1 &var2);
merge _ctf _csfe;
by varname1 varname2 first;
label cs='Chi-Square' csdf='Chi-Square DF' csp='Chi-Square p-value' fisherp='Fisher p-value'
frequency='N' rowpercent='%' varname1='Var. 1' varname2='Var. 2' text1='Level(1)' text2='Level(2)'
warning_percent='% of Cells in table with Expected counts<5';
rename frequency=count rowpercent=percent;
run;
*-Merge Counts/Percents & Chi-square/Fisher output datasets-*;
*------*;
*------*;
**--re-add numeric value variables afterwards for those variables that were originally numeric--**;
*-case 1: variable 1 was numeric variable-*;
%if &type1=Num %then %do;
run; ods listing close; run;
proc freq data=_temp; tables _temp1; ods output OneWayFreqs=_owf1;
run; ods listing; run;
data _owf1(keep= F__temp1 _temp1); set _owf1; f__temp1=left(f__temp1); run;
data _owf1; set _owf1; rename F__temp1=text1 _temp1=value1; run;
data _owf1(drop= text1); set _owf1; mergeby1=compress(text1,' ');
data &outdata; set &outdata; mergeby1=compress(text1,' ');
proc sort data=_owf1; by mergeby1;
proc sort data=&outdata; by mergeby1;
data &outdata;*(drop= mergeby1);
merge &outdata(in=in1) _owf1;
by mergeby1;
label value1='Level(1)';
run;
%end;
*-case 1: variable 1 was numeric variable-*;
*-case 2: variable 2 was numeric variable-*;
%if &type2=Num %then %do;
run; ods listing close; run;
proc freq data=_temp; tables _temp2; ods output OneWayFreqs=_owf2;
run; ods listing; run;
data _owf2(keep= F__temp2 _temp2); set _owf2; f__temp2=left(f__temp2); run;
data _owf2; set _owf2; rename F__temp2=text2 _temp2=value2; run;
data _owf2(drop= text2); set _owf2; mergeby2=compress(text2,' ');
data &outdata; set &outdata; mergeby2=compress(text2,' ');
proc sort data=_owf2; by mergeby2;
proc sort data=&outdata; by mergeby2;
data &outdata;*(drop= mergeby2);
merge &outdata(in=in1) _owf2;
by mergeby2;
label value2='Level(2)';
run;
%end;
*-case 2: variable 2 was numeric variable-*;
*------*;
*------*;
*-(optional) printed output-*;
*print with the Fisher Exact test p-value;
%if &print=Y & &exact=Y %then %do;
proc sort data=&outdata; by varname1 varname2 descending first text1 text2;
proc print data=&outdata l;
by varname1 varname2;
id varname1 varname2;
var text1 text2 count percent cs csdf csp fisherp warning_percent;
*title; title "If % of Cells with Expected counts<5 is greater than 25%, Chi-square test is not advised"; run; title; *footnote;
%end;
*print without the Fisher Exact test p-value;
%if &print=Y & &exact=N %then %do;
proc sort data=&outdata; by varname1 varname2 descending first text1 text2;
proc print data=&outdata l;
by varname1 varname2;
id varname1 varname2;
var text1 text2 count percent cs csdf csp warning_percent;
*title; title "If % of Cells with Expected counts<5 is greater than 25%, Chi-square test is not advised"; run; title; *footnote;
%end;
*-(optional) printed output-*;
*------*;
*------*;
*delete temporary working datasets in macro;
proc datasets library=work nolist nowarn;
delete _temp _ctf _cs _fe _warning _contents _observed _ti _tj _n _expected _csfe _temp _owf1 _owf2;
run;
quit;
*------*;
%mend;
run;
*------IBYJ macro --- END------*;
*------*;