x cd /pub/data/pl942000/Tools; %let pgm=cnvt1state; filename pgm "&pgm..sas"; *--You can delete this line-; *----Read the 2000 Census PL94-171 data files as released by the Bureau of the Census, March, 2001. Coded by John Blodgett, OSEDA, U. of Missouri, Columbia, MO. Under contract with the Missouri Census Data Center. -----*; *---Update history: 4.21.01: assignment of geocode variable for sumlev 610 and 620 fixed. Blanks removed from geocode. ---*; libname pl942000 '/pub/data/pl942000'; *--This is the directory where the output data sets are saved--; %let maxobs=max; *--assign parm a value of max to do a complete run--; %macro slfilter; %*--this macro is invoked twice below (Step 2): when reading both data and geoheaders file. It specifies which geographic summary levels to drop-----; if sumlev in ('710','720','730') then delete; *--we can easily sum these from the 750 levels if needed; %mend slfilter; %macro concatd(g1,g2,g3,g4,g5,g6,g7); %*--utility macro to return a SAS expression that will be the concatenation of specified char variables separated by dashes-; %local dash gc; %let dash=%str(||'-'||); %*--we build the value of local variable gc which is then "returned" as the result of invoking the macro-; %let gc=&g1; %if &g2= %then %goto rtrn; %let gc=&gc&dash&g2; %if &g3= %then %goto rtrn; %let gc=&gc&dash&g3; %if &g4= %then %goto rtrn; %let gc=&gc&dash&g4; %if &g5= %then %goto rtrn; %let gc=&gc&dash&g5; %if &g6= %then %goto rtrn; %let gc=&gc&dash&g6; %if &g7= %then %goto rtrn; %let gc=&gc&dash&g7; %rtrn: &gc %mend concatd; %macro doit(state, stab); *--this is the macro that will be invoked once for each state converted using the dostates driver macro--; %let stabu=%upcase(&stab); *--upper case: used to verify that the file we are reading is correct-; %let stfips=&state; *--FIPS state code--; title "Convert 2000 PL94-171 Files for &statenm"; options ls=155 ps=59 nodate nocenter ; options compress=no; filename in1 "/pub/data/pl942000/&stab.00001.upl"; filename in2 "/pub/data/pl942000/&stab.00002.upl"; filename ingeo "/pub/data/pl942000/&stab.geo.upl"; *-------------Step 1: Read the comma-delimited pl94 files and create the "detail" pl94 data set------------*; *------------------------------------------------------------------------------------------------- The key to understanding this data step (and the data set that it creates) is to look at the keep statement at the bottom: keep state geo_id race_id hispanic voteage persons: We output 1 observation for each geographic area / fully-qualified racial category / hispanic origin category (1=yes,0=no) / voting age category (1=yes, 0=no) and a single "fact" variable, the number of persons. We only output non-zero cells. This looks more like microdata than a summary file. It is a lot smaller than what we would have if we stored the 280+ cell counts from the input files. The key to having something useful here, of course, is to build a system that can easily access these raw cell counts and produce quick summarizations. Each output observation is only 21 bytes long. --------------------------------------------------------------------------------------------* ; data detail; retain state "&state"; *--assign FIPS state code as constant--; length geo_id 5 race_id hispanic voteage 3 persons 5; *---We read the dual input files in the following 2 parallel code sections The code for reading in1 and in2 are nearly identical in form. We just read different files into different variables.----; infile in1 lrecl=1024 obs=&maxobs missover dsd; *<---------------Read input file 1----------------; input _first14 $char14. recid 7. +1 PL1i1-PL1i71 PL2i1-PL2i73; if _first14 ne "uPL,&stabu.,000,01," then do; _bad14+1; put _first14=; if _bad14 ge 25 then abort 100; *--Just a precaution --; end; infile in2 lrecl=1024 obs=&maxobs missover dsd; *<---------------Read input file 2----------------; input _first14 $char14. recid2 7. +1 PL3i1-PL3i71 PL4i1-PL4i73; if _first14 ne "uPL,&stabu.,000,02," then do; _bad14+1; put _first14=; if _bad14 ge 25 then abort 101; end; infile ingeo lrecl=512 obs=&maxobs; input @9 sumlev $3.; *--geographic summary level from the geo headers file. Can be used by slfilter macro--; *drop sumlev; *--we do not need this since we have a KEEP stmt at bottom of step. But just to emphasize... ; if recid ne recid2 or recid ne _n_ then do; *<---------------Verify file synchronization------; file log; put //'************Data Problem: The recid fields on the 2 input files do not match '/ 'each other or the SAS cycle counter (_n_): ' recid= recid2= _n_= ; list; abort 500; end; %slfilter *<-----Invoke the macro that will filter based on value of sumlev-------------------*; geo_id+1; *<---Assign the geographic key sequentially--; *--These arrays correspond to the table structure of the input file---; array PL1 (71) PL1i1-PL1i71; array PL2 (73) PL2i1-PL2i73; array PL3 (71) PL3i1-PL3i71; array PL4 (73) PL4i1-PL4i73; *--The ptr array has 63 entries corresponding to the 63 race_id values. The value of ptr(race_id) is the relative position of the count for that race group in the input PL1 array (table). Note that only 63 out 0f 71 values are used. The other 8 entries in the PL1 table are subtotals based on number of races. The other 3 tables (PL2, PL3, PL4) are similar except for 2 extra hispanic subtotals at the beginning of tables PL2 and PL4.--; array ptr(63) _temporary_ ( 3, 4,11, 5,12,16,27,6, 13,17,28,20,31,37,48,7, 14,18,29,21,32,38,49,23, 34,40,51,43,54,58,64,8, 15,19,30,22,33,39,50,24, 35,41,52,44,55,59,65,25, 36,42,53,45,56,60,66,46, 57,61,67,62,68,69,71); *---In these nested loops we process each of the 63 possible race combinations. (We do not output anything for the 64th case -- all race flags = 0.) We output up to 4 observations for each race combination based on the 4 possible combinations of hispanic/voting age. See further comments below----; do white=1,0; do black=1,0; do indian=1,0; do asian=1,0; do HawnPI=1,0; do other=1,0; *<========================assign the single race_id key using the race categories as binary digits--; race_id=white + 2* black + 4*indian + 8*asian + 16*hawnpi + 32*other; nraces=sum(of white--other); if nraces=0 then leave; *--no data for people of "no race"--; *--All PL tables have the 63-category race detail with further detail as follows: PL1: totals PL2: Non-hispanic PL3: 18+ (Voting age) PL4: 18+ and Non-hispanic. ---*; *--We output observations based on 3 basic dimensions: race, hispanic and voting age. The race dimension is race_id. Here we need to generate the 4 mutually exclusive combinations of hispanic and voting age (both are yes/no categories). The input data has 4 categories from which we can derive the 4 categories we need. ---; _plx_= ptr(race_id); *--The ptr table entry tells us which entry in the PL tables correspond to values of race_id.-; p1= PL1(_plx_); p2=PL2(_plx_+2); p3=PL3(_plx_); p4=PL4(_plx_+2); hispanic=1; voteage=1; persons= p3 - p4; link obsout; hispanic=1; voteage=0; persons= p1 - p2 - (p3 - p4); link obsout; hispanic=0; voteage=1; persons= p4; link obsout; hispanic=0; voteage=0; persons= p2 - p4; link obsout; end; end; end; end; end; end; return; obsout: if persons > 0 then output; *<---we save a lot of space by NOT KEEPING EMPTY (zero pop) cells--; return; *----In the Dimensional Data Warehouse model this data set is a "Fact Table". Persons is the single fact, all the other variables are dimensions.----*; keep state geo_id race_id hispanic voteage persons; label geo_id='Geographic key' race_id='Race category key' hispanic='Hispanic Origin?' voteage='Aged 18 or over?'; run; filename in1 clear; filename in2 clear; proc sort data=detail out=pl942000.&stab.detail(label='Each obs is a geo-race-hispanic-voting age cell' index=(geo_id)); by geo_id race_id hispanic voteage; run; *------------Step 2: Read the geography headers file and create the permanent &stab.geos data set-------*; data pl942000.&stab.geos(compress=yes sortedby=geo_id); length geo_id 5 GeoCode $32 SumLev $3 AreaName $90; *--establish variable order -- these go first--; retain State "&state" Stab "&stab"; length County $5 Tract $7; *--and then these... -; infile ingeo lrecl=512 obs=&maxobs; input @9 SumLev $3. @19 LogRecNo 7. @26 Region $1. Division $1. StateCe $2. State $2. Cnty $3. CntySC $2. @37 CouSubFP $5. CouSubCC $2. CouSubSC $2. PlaceFP $5. PlaceCC $2. PlaceDC $1. PlaceSC $2. @56 TractIn $6. BG $1. Block $4. @69 ConCit $5. @107 MSACMSA $4. MASC $2. CMSA2 $2. MACCI $1. PMSA $4. NECMA $4. @136 UrbanRur $1. cd106 $2. +6 (sldu sldl)($char3.) vtd $char6. vtdi $1. @158 ZCTA3 $3. ZIP $5. @173 (AreaLand AreaWatr)(14.) AreaName $90. FuncStat $1. gcuni $1. @293 Pop100 9. +9 IntPtLat 9.6 IntPtLon 10.6 LSADC $1. @332 PartFlag $1. (SDElm SDSec SDUni)($5.) TAZ $6. UGA $5. PUMA5 $5. PUMA1 $5.; %slfilter *<-----Invoke the macro that will filter based on value of sumlev-------------------*; geo_id+1; if cnty ne ' ' then County=state||cnty; *--edit the tract so that it is in xxxx.xx format with leading and trailing zeroes--; if TractIn ne ' ' then Tract=translate( substr(TractIn,1,4)||'.'||substr(TractIn,5,2) ,'0',' '); drop tractin; select(SumLev); *---Assign the standardized Geographic Code to uniquely ID the geographic area--; when('750') geocode=%concatd(county,vtd,tract,block); when('740') geocode=%concatd(county,vtd,cousubfp,placefp,tract,bg); when('700') geocode=%concatd(county,vtd); when('610') geocode=%concatd(state,sldu); *--fixed this bug 4-21-01. We had 610 and 620 reversed-; when('620') geocode=%concatd(state,sldl); when('500') geocode=%concatd(state,cd106); when('172') geocode=%concatd(state,concit,placefp); when('170') geocode=%concatd(state,concit); when('160') geocode=%concatd(state,placefp); when('155') do; geocode=%concatd(county,placefp); length _arg $7; _arg=state||placefp; areaname=put(_arg,$fplace.); if areaname=_arg then areaname='Place '||placefp; end; when('140') geocode=%concatd(county,tract); when('060') geocode=%concatd(county,cousubfp); when('050') geocode=county; when('040') geocode=state; otherwise do; *<====We have not yet done code for the various Alaskan and Indian geo levels--; if '280' le SumLev le '285' then geocode=' '; else put '****Unexpected SumLev code encountered: No value assigned to geocode: ' SumLev= areaname=; end; end; *--select group--; geocode=compress(geocode,' '); *--remove any blanks from geocode-----------------(mod 4.21.01)---; *--Create land and total area values in square miles. --; LandSQMI=AreaLand/2589988; AreaSQMI=LandSQMI + (AreaWatr/2589988); format LandSQMI AreaSQMI 9.2; label SumLev='Geographic Summary Level'; label cnty='County code'; label LandSQMI='Land Area Sq Mls' AreaSQMI='Total Area Sq Mls' AreaLand='Land Area Sq Meters' AreaWatr='Water Area Sq Meters'; label cd106='Cong District - 106th (1998)' sldu='State Leg District Upper Chbr' sldl='State Leg District Lower Chbr'; label ZIP='ZIP Census Tabulation Area'; run; filename ingeo clear; proc datasets ddname=pl942000 nolist; *--create some indices--; modify &stab.geos; index create geo_id/ unique; index create SumLev; index create geocode; quit; *-------------------Step 3: Create the more traditional summary data set/view by aggregating the details set------------*; *--Note that we create an all-cells-0 summary for geo areas with 0 population--; data pl942000.&stab.sumsng(label='Summary data - 1 obs per geo area - no geocodes' sortedby=geo_id); merge pl942000.&stab.geos(keep=geo_id Pop100 areaname geocode in=ing) pl942000.&stab.detail(in=ind); by geo_id; if not (ing and ind) and Pop100 > 0 then do; put '**Data missing for geo or detail set: ' _n_= geo_id= areaname= geocode= ing= ind=; _nomatch+1; if _nomatch ge 100 then abort abend; drop _nomatch areaname geocode; delete; end; length default=5; *--following retain stmt establishes variable order which we take advantage of in the array stmts-; retain TotPop White1 White2 Black1 Black2 Indian1 Indian2 Asian1 Asian2 HawnPI1 HawnPI2 Other1 Other2 Whitenh1 Whitenh2 HispPop Over18 WhOvr181 WhOvr182 BlOvr181 BlOvr182 InOvr181 InOvr182 AsOvr181 AsOvr182 HaOvr181 HaOvr182 OtOvr181 OtOvr182 WNOvr181 WNOvr182 HisOvr18; retain MultRace MROvr18; *----For each of the basic 6 race categories we keep a count of persons reporting that race alone and persons reporting that race alone or in combination with other races (i.e. who are at least partly that race). We also do this for the white/non-Hispanic pop. And, we repeat all this for the voting age (Ovr18) pop. ---; label HispPop='Hispanic Population' White1='White alone' White2='White alone or in combination' Black1='Black alone' Black2='Black alone or in combination' Indian1='Amer Indian alone' Indian2='Amer Indian alone or in combination' Asian1='Asian alone' Asian2='Asian alone or in combination' HawnPI1='Hawaaian or PI alone' HawnPI2='Hawaaian or PI alone or in combination' Other1='Some Other race alone' Other2='Some other race alone or in combination' Whitenh1='White alone, Non Hispanic' Whitenh2='White alone or in combination, Non Hispanic'; label WhOvr181='White alone, over 18' WhOvr182='White alone or in combination, over 18'; label BlOvr181='Black alone, over 18' BlOvr182='Black alone or in combination, over 18'; label InOvr181='AIAN alone, over 18' InOvr182='AIAN alone or in combination, over 18'; label AsOvr181='Asian alone, over 18' AsOvr182='Asian alone or in combination, over 18'; label HaOvr181='Hawaaian or PI alone, over 18' HaOvr182='Hawaiian or PI alone or in combination, over 18'; label OtOvr181='Other race alone, over 18' OtOvr182='Other race alone or in combination, over 18'; label WNOvr181='White Non-hisp alone, over 18' WNOvr182='White Non-hisp alone or in combination, over 18'; label HisOvr18='Hispanic Pop Over 18'; label MultRace='Multi Racial' MROvr18='Multi Racial Over 18'; *--The reatin statement above specifies the order of the variables in the PDV and allows us to take these "shortcuts" in listing the variables that make up the arrays.--; array rsums(7,2) white1--Whitenh2; *--6 race cats + White/NH are rows, min and max are the cols here-; array Ovr18sums(7,2) WhOvr181--WNOvr182; *--same idea as rsums but now for the over18 subpop-; array tsums(6) totpop over18 hisppop HisOvr18 MultRace MROvr18; if first.geo_id then do; do i=1 to 7; rsums(i,1)=0; rsums(i,2)=0; Ovr18sums(i,1)=0; Ovr18sums(i,2)=0; end; do i=1 to dim(tsums); tsums(i)=0; end; if Pop100=0 then goto lastgeo; end; *--first.geo_id, start of a new geographic area--; array flags(63,7) _temporary_; array sflags(7) white black indian asian hawnpi other nraces; if _n_=1 then do _i_=1 to 63; *--load the race_ids data set into a 63 x 7 temporary array-; set pl942000.race_ids (drop=RaceLabl rename=(race_id=nck) firstobs=2); if _i_ ne nck then abort abend; *--should NOT happen!--; do j=1 to 7; flags(_i_,j)=sflags(j); end; end; nraces=flags(race_id,7); totpop + persons; if hispanic then hisppop + persons; if voteage then over18 + persons; if hispanic and voteage then HisOvr18 + persons; if nraces gt 1 then MultRace + persons; if nraces gt 1 and voteage then MROvr18 + persons; do i=1 to 6; *--loop over 6 race categories-; sflags(i)=flags(race_id,i); if sflags(i) then do; *--the following 2 statements do the major work of this step. It sums the counts to create the 12 white1, white2, black1, black2, etc. variables.----*; if nraces=1 then rsums(i,1) + persons; rsums(i,2) + persons; *--note that nraces does not matter here. For example, when i=2 we get here if and only if sflags(2)=1, which means the race_id code indicates the Black box was checked. rsums(2,2) is Black2. --; if voteage then do; if nraces=1 then Ovr18sums(i,1) + persons; Ovr18sums(i,2) + persons; end; if i=1 and not hispanic then do; *--Count white non-hispanics--; if nraces=1 then whitenh1 + persons; whitenh2 + persons; if voteage then do; *--White non-hispanics over 18-; if nraces=1 then WNOvr181 + persons; WNOvr182 + persons; end; end; end; end; *--do i=1 to 6 loop; lastgeo: if last.geo_id then do; if totpop ne pop100 then do; _nbad+1; if _nbad le 10 then put '**Pop Counts Off: ' geo_id= areaname= geocode= totpop= pop100=; if _nbad=10 then put '***No more of these cases will be flagged***'; end; drop _nbad; output; end; drop Pop100; drop race_id voteage hispanic; drop white--other nraces persons i j _i_ nck; *--none of these make sense at the geographic totals level-; run; *---Create a view that merges the geographic data with the pop counts------; proc sql; create view pl942000.&stab.sums(label='Pop Counts with Geographic codes') as select * from &stab.geos g , &stab.sumsng d where d.geo_id = g.geo_id; describe view pl942000.&stab.sums; quit; proc print data=pl942000.&stab.sums( obs=50); id geo_id; title2 "First 50 Obs from the pl942000.&stab.sums Data Set (View)"; run; %mend doit; options mprint; *dostates(20); *dostates(17); %dostates(29); run; %include sascode(notify); *--You can delete this line--;