/***************************************************************************** /* /* Program: CODEBOOK.SAS /* Author: Greg Silva (11/16/1998) /* /*---------------------------------------------------------------------- /* Copyright(c) 1999, PPD Development. /* All Rights Reserved. /* /*---------------------------------------------------------------------- /* To paraphrase standard licensing agreements: /* /* THIS PROGRAM IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, /* EITHER EXPRESSED OR IMPLIED. THE ENTIRE RISK AS TO THE QUALITY AND /* PERFORMANCE OF THIS PROGRAM IS WITH YOU. /* /* PLEASE LEAVE ALL HEADER INFORMATION IN THE PROGRAM. /*---------------------------------------------------------------------- /* /* NOTE: This program was written to run on the OpenVMS platform. Be /* sure to check your log for any options that may not be valid /* on your operating system. /* /* /* Description: This program (based on orignal work from Mike Witt) /* prints summary statistics for all variables in /* a specified dataset. It also provides variable /* information (length, format, type). /*---------------------------------------------------------------------- /* Modified: /* /* /*---------------------------------------------------------------------- /* /* Parameters /* /* Parameter Description /* --------- --------------------------------------------------- /* LIBNAME LIBNAME defined for the dataset. /* /* STUDY Identifier used in the output header. /* /* DSNAME Name of dataset to report on. /* /* FREQ Produce frequency information? (YES/NO) /* /* OUTTYPE Format of output. (TEXT/HTML) /* /* FORMATS Has a format library been assigned to the libname /* LIBRARY? (YES/NO) /* /* OUTDSN Name of the dataset to store output from CODEBOOK in. /* /* DIRTYPE Description of the type of data you are producing. /* /* TOTFREQ Total number of frequencies to produce. 10 = the 5 /* lowest + the 5 highest unique values. /* /* FIRSTBK Flag to identify whether this is the first codebook /* you are running. Set to 1 for the first book, set /* to 0 for all others. This uses the MOD parameter /* in the PRINTTO to append output. /* /* KEEP List of variables to report on. If blank, then the /* entire dataset is used. The list is space delimited. /* /*****************************************************************************/ %macro codebook(libname = BIO, study = CLIENT_X, dsname =, freq = YES, outtype = TEXT, formats = YES, filepath = , outdsn = _NULL_, dirtype = CREATED, totfreq = 10, firstbk = 1, keep = ) ; %let halffreq = %eval(&totfreq / 2) ; %* Default to TEXT. %if (%upcase(&outtype) ^= HTML) %then %let outtype = TEXT; %if (%str(&keep) ^= ) %then %let keepstr = (keep = &keep ) ; %else %let keepstr = ; *** Get the dataset information into a dataset. ; proc contents data = &libname..&dsname &keepstr noprint out = cont (keep = name type memname nobs modate crdate label length type formatl formatd format) ; run; *** Get the list of variables of each type. ; data nums (keep = name ) chars (keep = name ) ; set cont (keep = name type nobs) end = eof; by name ; if (type = 1) then do ; _numcnt ++ 1 ; call symput('nv' || left(put(_numcnt, 5.)), compress(name)) ; output nums ; end ; else if (type = 2) then do ; _charcnt ++ 1 ; call symput('cv' || left(put(_charcnt, 5.)), compress(name)) ; output chars ; end ; __cnt ++ 1 ; *** Store the total number of variables. ; if (eof) then do ; call symput('numvars', left(put(__cnt, 5.))) ; call symput('charcnt', left(put(_charcnt, 5.))) ; call symput('numcnt' , left(put(_numcnt, 5.))) ; call symput('nobs', left(put(nobs, 8.))) ; end ; run ; %*** Check for numeric variables before running. ; %if (&numcnt > 0) %then %do ; %* Do the numeric first. ; %let type = NUM; *** Build the univariate code with the list of variable name. ; proc univariate data = &libname..&dsname &keepstr noprint ; var %do i = 1 %to &numcnt ; &&nv&i %end ; ; output out = temp n = %do i = 1 %to &numcnt ; n&i %end ; nmiss = %do i = 1 %to &numcnt ; nmis&i %end ; min = %do i = 1 %to &numcnt ; min&i %end ; max = %do i = 1 %to &numcnt ; max&i %end ; mean = %do i = 1 %to &numcnt ; mean&i %end ; median = %do i = 1 %to &numcnt ; med&i %end ; ; run ; *** Univariate creates a single record, so split it out to a record for each variable. ; data &type.u (keep = name n nmiss min max mean median ); set temp ; length name $ 8 ; %do i = 1 %to &numcnt ; name = "&&nv&i" ; n = n&i ; nmiss = nmis&i ; min = min&i ; max = max&i ; mean = mean&i ; median = med&i ; output ; %end ; run ; proc sort data = &type.u ; by name ; run ; *** Transpose the character variables. In this case, the new dataset contains ; *** the variable name in the _NAME_ variable, and the value in the COL1 variable. ; data &type.DS ; set &libname..&dsname (keep = %do i = 1 %to &numcnt; &&nv&i %end ; ) ; __cntr__ = _N_ ; run ; proc transpose data = &type.DS out = &type ; by __cntr__ ; var %do i = 1 %to &numcnt; &&nv&i %end ; ; run; proc sort data = &type (rename = (_name_ = name) drop = __CNTR__); by name col1 ; run ; *** Perform a freq to cut down on the number of observations for ; *** the next data step. ; proc freq data = &type noprint ; by name ; table col1 /out = test missing; run ; data test ; merge test &type.u ; by name ; run; data allnum (keep = name _text _line _form nmin nmax nmean nmed unique nmiss n label length) ; merge test (in=in1) end= eof cont (keep = name length format formatl label) ; by name ; retain _cnt_ ; length form _form $10 _text $160 xfreq $20 ; %if (&freq ^= NO) %then %do ; array _numz_ {500000} _temporary_ ; array _pctz_ {500000} _temporary_ ; array _cntz_ {500000} _temporary_ ; %end ; *** Drop the non-numeric variables. ; if (in1) ; *** Count the unique values. ; if (first.name) then _cnt_ = 0 ; _cnt_ ++ 1; %if (&freq ^= NO) %then %do ; *** Store the unique values. ; _numz_{_cnt_} = col1 ; _pctz_{_cnt_} = percent ; _cntz_{_cnt_} = count ; %end ; if (last.name) then do; unique = _cnt_ ; *** Output the variables used for the Web page. Add formats if they exist. ; if (format ^= "") then do ; if (formatl > 0) then form = trim(left(format)) || trim(left(formatl)) || '.' ; else form = trim(left(format)) || '.' ; _form = left(form) ; end; if (format ^= "" and "&formats" = "YES") then do ; *** Use the putn function to assign formats at runtime. ; nmin = putn(min, form) ; nmax = putn(max, form) ; nmean = putn(mean, form) ; nmed = putn(median, form) ; name = left(name) ; _text = "Minimum ..................... " || put(min, best8.) || " (" || trim(left(nmin)) || ")" ; _line = -10 ; output ; _text = "Maximum ..................... " || put(max, best8.) || " (" || trim(left(nmax)) || ")" ; _line = -9 ; output ; _text = "Mean ........................ " || put(mean, best8.) || " (" || trim(left(nmean)) || ")" ; _line = -8 ; output ; _text = "Median ...................... " || put(median, best8.) || " (" || trim(left(nmed)) || ")" ; _line = -7 ; output ; _text = "Number unique ............... " || put(_cnt_, best8.) ; _line = -6 ; output ; _text = "Number missing .............. " || put(nmiss, best8.) ; _line = -5 ; output ; _text = "Number non-missing .......... " || put(n, best8.) ; _line = -4 ; output ; %if (&freq ^= NO) %then %do ; if (_cnt_ > 2) then do ; _text = "Unique value Count Percent"; _line = 0; output; *** Change for parameter with number of unique. ; if (_cnt_ <= &totfreq) then do ; do xx = 1 to _cnt_ ; _text = ".............................." || put(_cntz_{xx}, best8.) || " " || put(_pctz_{xx}, 4.1) || '%' ; if (_numz_{xx} = .) then substr(_text,1,10) = "-missing- " ; else do ; xfreq = putn(_numz_{xx}, form) ; substr(_text,1,length(trim(left(xfreq)))+1) = trim(left(xfreq)) || " " ; end ; _line = xx; output; end ; end ; else do ; *** Change for parameter with number of unique. ; do xx = 1 to &halffreq ; _text = ".............................." || put(_cntz_{xx}, best8.) || " " || put(_pctz_{xx}, 4.1) || '%' ; if (_numz_{xx} = .) then substr(_text,1,10) = "-missing- " ; else do ; xfreq = putn(_numz_{xx}, form) ; substr(_text,1,length(trim(left(xfreq)))+1) = trim(left(xfreq)) || " " ; end ; _line = xx; output; end ; _text = " ---- &halffreq highest values ---- " ; *** Change for parameter with number of unique. ; _line = &halffreq + 1 ; output ; *** Change for parameter with number of unique. ; zcnt = (_cnt_ - &halffreq) + 1 ; do xx = zcnt to _cnt_; _text = ".............................." || put(_cntz_{xx}, best8.) || " " || put(_pctz_{xx}, 4.1) || '%' ; if (_numz_{xx} = .) then substr(_text,1,10) = "-missing- " ; else do ; xfreq = putn(_numz_{xx}, form) ; substr(_text,1,length(trim(left(xfreq)))+1) = trim(left(xfreq)) || " " ; end ; _line = xx ; output; end ; end; end ; %end; end ; else do ; nmin = put(min, best8.) ; nmax = put(max, best8.) ; nmean = put(mean, best8.) ; nmed = put(median, best8.) ; name = left(name) ; if (format = "") then _form = "" ; _text = "Minimum ..................... " || put(min, best8.) ; _line = -10 ; output ; _text = "Maximum ..................... " || put(max, best8.) ; _line = -9 ; output ; _text = "Mean ........................ " || put(mean, best8.) ; _line = -8 ; output ; _text = "Median ...................... " || put(median, best8.) ; _line = -7 ; output ; _text = "Number unique ............... " || put(_cnt_, best8.) ; _line = -6 ; output ; _text = "Number missing .............. " || put(nmiss, best8.) ; _line = -5 ; output ; _text = "Number non-missing .......... " || put(n, best8.) ; _line = -4 ; output ; %if (&freq ^= NO) %then %do ; if (_cnt_ > 2) then do ; _text = "Unique value Count Percent"; _line = 0; output; *** Change for parameter with number of unique. ; if (_cnt_ <= &totfreq ) then do ; do xx = 1 to _cnt_ ; _text = ".............................." || put(_cntz_{xx}, best8.) || " " || put(_pctz_{xx}, 4.1) || '%' ; if (_numz_{xx} = .) then substr(_text,1,10) = "-missing- " ; else substr(_text,1,length(compress(put(_numz_{xx}, best8.)))+1) = compress(put(_numz_{xx}, best8.)) || " " ; _line = xx ; output; end ; end ; else do ; *** Change for parameter with number of unique. ; do xx = 1 to &halffreq ; _text = ".............................." || put(_cntz_{xx}, best8.) || " " || put(_pctz_{xx}, 4.1) || '%' ; if (_numz_{xx} = .) then substr(_text,1,10) = "-missing- " ; else substr(_text,1,length(compress(put(_numz_{xx}, best8.)))+1) = compress(put(_numz_{xx}, best8.)) || " " ; _line = xx ; output; end ; _text = " ---- &halffreq highest values ---- " ; *** Change for parameter with number of unique. ; _line = &halffreq + 1 ; output ; *** Change for parameter with number of unique. ; zcnt = (_cnt_ - &halffreq) + 1; do xx = zcnt to _cnt_; _text = ".............................." || put(_cntz_{xx}, best8.) || " " || put(_pctz_{xx}, 4.1) || '%' ; substr(_text,1,length(compress(put(_numz_{xx}, best8.)))+1) = compress(put(_numz_{xx}, best8.)) || " " ; _line = xx ; output; end ; end; end ; %end ; end ; end ; run ; proc sort data = allnum ; by name _line ; run; %end ; %*** If numeric variables. ; %*** Check for character variables before running. ; %if (&charcnt > 0) %then %do ; %let type = CHAR; data &type.DS ; set &libname..&dsname (keep = %do i = 1 %to &charcnt; &&cv&i %end ; ) ; __cntr__ = _N_ ; run ; proc transpose data = &type.DS out = &type ; by __cntr__ ; var %do i = 1 %to &charcnt; &&cv&i %end ; ; run; proc sort data = &type (rename = (_name_ = name) drop = __CNTR__ ); by name col1 ; run ; *** Perform a freq to cut down on the number of observations for ; *** the next data step. ; proc freq data = &type noprint ; by name ; table col1 /out = test missing; run ; data allchar (keep = name _line _text _form xmin xmax n unique nmiss format length ) ; merge test (in=in1) end=eof cont (keep = name length format formatl label) ; by name ; length min max xfreq $40 xmin xmax $40 form _form $10 _text $160; retain min nmiss n total unique ; %if (&freq ^= NO) %then %do ; array _charz_ {200000} $40 _temporary_ ; array _pctz_ {200000} _temporary_ ; array _cntz_ {200000} _temporary_ ; %end ; if (in1) ; *** Calculate the stats by hand. Initialize all counters for each variable. ; if (first.name) then do ; n = 0 ; min = "" ; nmiss = 0 ; total = 0 ; unique = 0 ; end ; unique = unique + 1 ; %if (&freq ^= NO) %then %do ; *** Store the unique values. ; _charz_{unique} = col1 ; _pctz_{unique} = percent ; _cntz_{unique} = count ; %end ; *** Use a blank as missing values. ; if (compress(col1) = "") then do ; nmiss = nmiss + count ; end ; else do ; n = n + count ; if (compress(min) = "") then min = left(col1) ; end ; *** If the last record for the variable, then calculate max, and create output variables for Web. ; if (last.name) then do ; max = left(col1) ; if (unique = 1) then min = max ; if (format ^= "") then do ; if (formatl > 0) then form = trim(left(format)) || trim(left(formatl)) || '.' ; else form = trim(left(format)) || '.' ; _form = left(form) ; end ; if (format ^= "" and "&formats" = "YES") then do ; * Use the putc function to assign formats at runtime. ; xmin = putc(min, form) ; xmax = putc(max, form) ; name = left(name) ; _text = "Minimum ................... " || put(min, $char20.) || " ( " || trim(left(xmin)) || ")" ; _line = -10 ; output ; _text = "Maximum ................... " || put(max, $char20.) || " ( " || trim(left(xmax)) || ")" ; _line = -9 ; output ; _text = "Number unique ............. " || put(unique, best8.) ; _line = -8 ; output ; _text = "Number missing ............ " || put(nmiss, best8.) ; _line = -7 ; output ; _text = "Number non-missing ........ " || put(n, best8.) ; _line = -6 ; output ; %if (&freq ^= NO) %then %do ; if (unique > 2) then do ; _text = "Unique value Count Percent"; _line = 0; output; *** Change for parameter with number of unique. ; if (unique <= &totfreq) then do ; do xx = 1 to unique ; _text = "..................................................." || put(_cntz_{xx}, best8.) || " " || put(_pctz_{xx}, 4.1) || '%' ; if (compress(_charz_{xx}) = "") then substr(_text,1,8) = "-blank- " ; else do ; xfreq = putc(_charz_{xx}, form) ; substr(_text,1,length(trim(left(xfreq)))+1) = trim(left(xfreq)) || " " ; end ; _line = xx; output; end ; end ; else do ; *** Change for parameter with number of unique. ; do xx = 1 to &halffreq ; _text = "..................................................." || put(_cntz_{xx}, best8.) || " " || put(_pctz_{xx}, 4.1) || '%' ; if (compress(_charz_{xx}) = "") then substr(_text,1,8) = "-blank- " ; else do ; xfreq = putc(_charz_{xx}, form) ; substr(_text,1,length(trim(left(xfreq)))+1) = trim(left(xfreq)) || " " ; end ; _line = xx; output; end ; _text = " ---- &halffreq highest values ---- " ; *** Change for parameter with number of unique. ; _line = &halffreq + 1 ; output ; *** Change for parameter with number of unique. ; zcnt = (unique - &halffreq) + 1; do xx = zcnt to unique; _text = "..................................................." || put(_cntz_{xx}, best8.) || " " || put(_pctz_{xx}, 4.1) || '%' ; if (compress(_charz_{xx}) = "") then substr(_text,1,8) = "-blank- " ; else do ; xfreq = putc(_charz_{xx}, form) ; substr(_text,1,length(trim(left(xfreq)))+1) = trim(left(xfreq)) || " " ; end ; _line = xx ; output; end ; end; end ; %end ; end ; else do ; xmin = put(min, $char40.) ; xmax = put(max, $char40.) ; name = left(name) ; if (format = "") then _form = "" ; _text = "Minimum ................... " || put(min, $char40.) ; _line = -10 ; output ; _text = "Maximum ................... " || put(max, $char40.) ; _line = -9 ; output ; _text = "Number unique ............. " || put(unique, best8.) ; _line = -8 ; output ; _text = "Number missing ............ " || put(nmiss, best8.) ; _line = -7 ; output ; _text = "Number non-missing ........ " || put(n, best8.) ; _line = -6 ; output ; %if (&freq ^= NO) %then %do ; if (unique > 2) then do ; _text = "Unique value Count Percent"; _line = 0; output; *** Change for parameter with number of unique. ; if (unique <= &totfreq) then do ; do xx = 1 to unique ; _text = "..................................................." || put(_cntz_{xx}, best8.) || " " || put(_pctz_{xx}, 4.1) || '%' ; if (compress(_charz_{xx}) = "") then substr(_text,1,8) = "-blank- " ; else substr(_text,1,length(trim(left(_charz_{xx})))+1) = trim(left(_charz_{xx})) || " " ; _line = xx; output; end ; end ; else do ; *** Change for parameter with number of unique. ; do xx = 1 to &halffreq ; _text = "..................................................." || put(_cntz_{xx}, best8.) || " " || put(_pctz_{xx}, 4.1) || '%' ; if (compress(_charz_{xx}) = "") then substr(_text,1,8) = "-blank- " ; else substr(_text,1,length(trim(left(_charz_{xx})))+1) = trim(left(_charz_{xx})) || " " ; _line = xx; output; end ; _text = " ---- &halffreq highest values ---- " ; *** Change for parameter with number of unique. ; _line = &halffreq + 1 ; output ; *** Change for parameter with number of unique. ; zcnt = (unique - &halffreq) + 1; do xx = zcnt to unique; _text = "..................................................." || put(_cntz_{xx}, best8.) || " " || put(_pctz_{xx}, 4.1) || '%' ; if (compress(_charz_{xx}) = "") then substr(_text,1,8) = "-blank- " ; else substr(_text,1,length(trim(left(_charz_{xx})))+1) = trim(left(_charz_{xx})) || " " ; _line = xx ; output; end ; end; end ; %end ; end ; end; run ; proc sort data = allchar ; by name _line ; run; %end ; %*** If character variables. ; *** Get the characters and numerics together. ; data main ; set %if (&numcnt > 0) %then %do ; allnum %end ; %if (&charcnt > 0) %then %do ; allchar %end ; ; by name _line ; *** We have to remove the < and > from any formatted ; *** numerics or character strings for HTML. ; %if (&outtype = HTML) %then %do ; gt = index(_text, '>') ; if (gt > 0) then _text = substr(_text, 1, gt -1) || '>' || substr(_text, gt +1) ; lt = index(_text, '<') ; if (lt > 0) then _text = substr(_text, 1, lt -1) || '<' || substr(_text, lt +1) ; drop lt gt ; %end ; run ; proc sort data=cont; by name; run; %*** TEXT output section. ; %if (&outtype = TEXT) %then %do ; *options cc=cr nodate nonumber pagesize=32000; options nodate nonumber pagesize=32000; title "Codebook for &study" ; data &outdsn ; merge main (in = one) cont (in = two); by name; if one=0 or two=0 then put 'Error: Major Linking Problem ' name= one= two=; %if (&firstbk = 1) %then %do ; file "&filepath.&study..txt" print; %end ; %else %do ; file "&filepath.&study..txt" print mod ; %end ; length _text1-_text3 $50.; if _n_=1 then do; _text1 = trim(left(put(nobs,comma8.0))); temp = datepart(crdate); _text2 = trim(left(put(temp,weekdate37.))) || " " || put(crdate,tod8.); temp = datepart(modate); _text3 = trim(left(put(temp,weekdate37.))) || " " || put(modate,tod8.); moddate = put(temp, mmddyy10.) ; %* Start with header information. ; put _page_ ; todayis = put(today(), mmddyy10.) ; put "Codebook For: &study Directory: &dirtype Dataset: " memname / / / "Dataset: " MEMNAME ", Observations in dataset: " _text1 / "Number of variables in data set: &numvars" / "Dataset last modified: " moddate / "This listing generated on: " todayis ; end; %* For each variable, add header information on type, length, and format. ; if first.name then do; length = trim(left(length)) ; put / / / name @38 label ; if type=1 then put / " Type = Numeric, Length = " length @; else put / " Type = Character, Length = " length @; if format ne " " then put +(-1) ", Format = " _form ; else put ; end; put " " _text ; run; %end ; %* if outtype = TEXT. ; %*********** HTML output section. ; %else %if (&outtype = HTML) %then %do ; data &outdsn ; merge main (in = one) cont (in = two); by name; if one=0 or two=0 then put 'Error: Major Linking Problem ' name= one= two=; file "&filepath.&dsname..HTM" noprint ; length _text1-_text3 $50.; if _n_=1 then do; _text1 = trim(left(put(nobs,comma8.0))); temp = datepart(crdate); _text2 = trim(left(put(temp,weekdate37.))) || " " || put(crdate,tod8.); temp = datepart(modate); _text3 = trim(left(put(temp,weekdate37.))) || " " || put(modate,tod8.); moddate = put(temp, mmddyy10.) ; todayis = put(today(), mmddyy10.) ; %* Start HTML with header information. ; put "" / " " / " Codebook For: &study Directory: &dirtype Dataset: " memname "" / "

" / "

" / "Dataset: " MEMNAME ", Observations in dataset: " _text1 / "

" / "Number of variables in data set: &numvars" / "

" / "Dataset last modified: " moddate / "

" / "This listing generated on: " todayis / "

" / "" / "

" / "
" / " " / " " / " " / " " / " " / "
Index Bottom
" / "
" / "

" ; end; %* For each variable, add header information on type, length, and format. ; if first.name then do; length = trim(left(length)) ; put / / / "
" name "" @38 label ;
      if type=1 then put / "   Type = Numeric, Length = " length    @;
      else           put / "   Type = Character, Length = " length  @;
      if format ne " " then put +(-1) ", Format = " _form "";
      else put "";
   end;
   put "   " _text ;
run;

%* Add a footer to each file. ;
data _null_ ;

   file "&filepath.&dsname..HTM" noprint mod;

      put / / " 
" / / "

" / "

" / "
" / " " / " " / " " / " " / " " / "
Index Top
" / "

" / " " / "" / " "; run; %end ; %* if outtype = HTML. ; %*********** Dataset only output section. ; %else %if (&outdsn ^= _NULL_ and &outtype = ) %then %do ; data &outdsn ; merge main (in = one) cont (in = two); by name; if one=0 or two=0 then put 'Error: Major Linking Problem ' name= one= two=; run; %end ; %* if outtype = . ; %mend; %*codebook(libname = BIO, study = CLIENT_X, dsname =, freq = YES, outtype = TEXT, formats = YES, outdsn = _NULL_, dirtype = CREATED, totfreq = 10, firstbk = 1, keep = ); options symbolgen mprint mlogic; %codebook(libname = sashelp, study = CLIENT_X, dsname = class, freq = YES, outtype = TEXT, formats = YES, filepath = %str(C:\GP\Business\DAT\code\), outdsn = class_stats, dirtype = CREATED, totfreq = 50, firstbk = 1, keep = );