温馨提示×

温馨提示×

您好,登录后才能下订单哦!

密码登录×
登录注册×
其他方式登录
点击 登录注册 即表示同意《亿速云用户服务条款》

proc sql语句在数据清洗中的运用

发布时间:2021-08-30 15:19:47 来源:亿速云 阅读:140 作者:chen 栏目:数据库

本篇内容介绍了“proc sql语句在数据清洗中的运用”的有关知识,在实际案例的操作过程中,不少人都会遇到这样的困境,接下来就让小编带领大家学习一下如何处理这些情况吧!希望大家仔细阅读,能够学有所成!

libname clean "c:/books/clean";                                           /*定义永久性数据库*/

*新建一个样本数据one;
data one;
   input X Y Z;
datalines;
1 2 3
101 202 303
44 55 66
444 555 666
;
title "Values of X from data set ONE where X is greater than 100";
/*有条件的选择观测值*/

proc sql;
   select X
   from one
   where X gt 100;               
quit;

*Program 8-2;
***检查无效的字符型的数值;
title "Checking for Invalid Character Data";
proc sql;
   select Patno,
          Gender,
          DX,
          AE
   from clean.patients
   where Gender not in ('M','F',' ')           or
         notdigit(trim(DX))and not missing(DX) or
         AE not in ('0','1',' ');
quit;

*检查无效的数值型的数值;
title "Checking for out-of-range numeric values";
proc sql;
   select Patno,
          HR,
          SBP,
          DBP
   from clean.patients
   where HR  not between 40 and 100 and not missing(HR)    or
         SBP not between 80 and 200 and not missing(SBP)   or
         DBP not between 60 and 120 and not missing(DBP);
quit;

*基于标准差利用简单的算法来检查数值;
title "Data values beyond two standard deviations";
proc sql;
   select Patno,
          SBP
   from clean.patients
   having SBP not between mean(SBP) - 2 * std(SBP) and
      mean(SBP) + 2 * std(SBP)                     and
      SBP is not missing;
quit;

*检查缺失值;
options linesize=84;
title "Observations with missing values";
proc sql;
   select *
   from clean.patients
   where Patno   is missing or
         Gender  is missing or
         Visit   is missing or
         HR      is missing or
         SBP     is missing or
         DBP     is missing or
         DX      is missing or
         AE      is missing;
quit;

*检查日期;
title "Dates before June 1, 1998 or after October 15, 1999";
proc sql;
   select Patno,
          Visit
   from clean.patients
   where Visit not between '01jun1998'd and '15oct1999'd and
         Visit is not missing;
quit;

*检查重复值;
title "Duplicate Patient Numbers";
proc sql;
   select Patno,
          Visit
      from clean.patients
      group by Patno
      having count(Patno) gt 1;
quit;

*识别对应多个观察值的变量;
title "Listing of patients who do not have two visits";
proc sql;
   select Patno,
          Visit
      from clean.patients2
      group by Patno
      having count(Patno) ne 2;
quit;

*检查两个文件中对应要求的序列号ID;
data one;
   input Patno X Y;
datalines;
1 69 79
2 56 .
3 66 99
5 98 87
12 13 14
;
data two;
   input Patno Z;
datalines;
1 56
3 67
4 88
5 98
13 99
;

*两个文件都不含的ID;
title "Patient numbers not in both files";
proc sql;
   select One.patno as ID_one,
          Two.patno as ID_two
   from one full join two
   on One.patno eq Two.patno
   where One.patno is missing or Two.patno is missing;
quit;

“proc sql语句在数据清洗中的运用”的内容就介绍到这里了,感谢大家的阅读。如果想了解更多行业相关的知识可以关注亿速云网站,小编将为大家输出更多高质量的实用文章!

向AI问一下细节

免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:is@yisu.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。

sql
AI