# KeepNotes blog

Stay hungry, Stay Foolish.

0%

This is reference to the 2.3 section of Data management and 2.4 Date and time variables in <SAS and R: Data Management, Statistical Analysis, and Graphics (second edition)>.

#### Subsetting datasets by rows

``````data class;
set sashelp.class (firstobs=5);
run;``````

``````data class;
set sashelp.class (obs=5);
run;``````

``````data class;
set sashelp.class (firstobs=5 obs=10);
run;``````

#### Subsetting datasets by observations

``````# R code
filter(iris, Species == "setosa")

# SAS code
data class;
set sashelp.class(where=(Sex="M"));
run;``````

#### Splitting a Dataset

``````data female male;
set sashelp.class;
if Sex="F" then output female;
if Sex="M" then output male;
run;``````

#### Drop or keep variables in a dataset

``````# R code
dplyr::select(iris, c("Sepal.Length", "Sepal.Width"))

# SAS code
data class;
set sashelp.class(keep= Name Sex Age);
run;``````

#### Random sample of a dataset

``````# R code
# replace = FALSE
dplyr::sample_n(iris, 10)

# SAS code
/* sample without replacement 简单随机抽样*/
proc surveyselect data=sashelp.class out=outds n=10 method=srs; run;``````

#### Observation number

``````# R code
row.names(iris)

# SAS code
data _null_;
set sashelp.class;
put _n_;
run;``````

#### Keep unique values

• 整个数据集去重
• 基于指定的变量（列）来去重

``````# R code
unique(iris) # All duplicate rows removed
dplyr::distinct(iris, Species, .keep_all = TRUE) # Species rows removed

# SAS code
/*Romove duplicated value by all variables*/
proc sort data=sashelp.retail out=retail_without_duplicated_values nodupkey;
by _all_;
run;
/*Remove duplicatd values by Year variable*/
proc sort data=sashelp.retail out=retail_with_unique_value nodupkey;
by Year;
run;``````

#### Identify duplicated values

``````# R code

/*Return duplicatd values by Year variable*/
proc sort data=sashelp.retail out=retail_with_duplicated_value nouniquekey;
by Year Month;
run;``````

#### Convert from wide to long format or long to wide format

``````# R code
# wide to long transform
relig_income %>%
pivot_longer(!religion, names_to = "income", values_to = "count")

# long to wide transform
fish_encounters %>%
pivot_wider(names_from = station, values_from = seen, values_fill = 0)

# SAS code
/*proc transpose, the same as pivot for long to wide or wide to long*/
data wide;
input SID \$ Programming State English;
datalines;
S01 98 100 80
S02 84 98 94
S03 89 92 88
;
run;

/*wide to long transform*/
proc transpose data=wide out=long(rename=(_name_=Coursename col1=Score));
var Programming State English;
by SID;
run;

/*long to wide transform*/
proc transpose data=long out=Rewide(drop=_name_);
var Score;
by SID;
id Coursename;
run;``````

#### Concatenate and stack datasets

``````/* one to one, having sample observation*/
data class1(keep=name sex) class2(keep=age height weight);
set sashelp.class;
output class1;
output class2;
run;``````

``````/*Set*/
data class;
set class1;
set class2;
run;

/*Sometime is the same as merge*/
data class;
merge class1 class2;
run;``````

#### Merge datasets

``````# R code
band_members %>% inner_join(band_instruments, by = "name")

# SAS code
/*For correct way is first sort and then merge*/
data class1(keep=name sex) class2(keep=name age height weight);
set sashelp.class;
output class1;
output class2;
run;
proc sort data=class1;
by name;
run;
proc sort data=class2;
by name;
run;
data class;
merge class1 class2;
by name;
run;

/*proc sql to implementary left join, right join, inner join*/
data class1(keep=name sex) class2(keep=name age height weight);
set sashelp.class;
output class1;
if _n_ in (1,5,10,15) then output class2;
run;
data class2
set class2;
if name="Janet" then name="Janey";
run;

proc sql;
create table class_left as
select a.*,b.*
from class1 as a left join class2 as b
on a.name=b.name;
quit;``````

#### Select, copy, change name, delete datasets

``````/*select, copy, change name, delete*/
proc datasets;
copy in=sashelp out=work;
select class;
quit;
proc datasets lib=work;
change class=student;
quit;
proc datasets lib=work;
delete student;
quit;

/*delete all datasets*/
proc datasets lib=work kill memtype=data;
run;
/*only save class data*/
proc datasets lib=work;
save class;
run;``````

#### Create date variable

``````# R code
as.Date("2014-04-29")
as.Date(Sys.time())

data dt;
dayvar = input("04/29/2014", mmddyy10.);
dayvar2 = mdy(4, 29, 2014);
todays_date = today();
run;``````

SAS and R: Data Management, Statistical Analysis, and Graphics (second edition)