Removing rows and/or columns that contain only NA values
# Keep only the rows and columns of fsFit that hold at least one non-NA value.
# Logical masks are used instead of -which(...): when no row (or column) is
# entirely NA, which() returns integer(0), and indexing with -integer(0)
# selects nothing, so the whole object would be silently emptied.
# drop = FALSE keeps the two-dimensional shape even if one row/column is left.
fsFit[rowSums(!is.na(fsFit)) > 0, colSums(!is.na(fsFit)) > 0, drop = FALSE]
Monday, April 26, 2010
Wednesday, April 21, 2010
Extracting a random percentage of the data within each group
1) Randomly choose 10% of data from each "age" group.
> # Simulated data: 100 rows, each assigned to one of 4 groups, ages uniform on [4, 80].
> x <- data.frame(group = sample(1:4, 100, replace = TRUE),
                  age = runif(100, min = 4, max = 80))
> # Per group: draw a random tenth of the ages (sample() truncates the
> # fractional size) and report the mean of that sub-sample.
> with(x, tapply(age, group, function(ages) {
    n_pick <- length(ages) / 10
    mean(ages[sample(seq_along(ages), n_pick)])
  }))
2) To split my dataset randomly into 2 parts: a prediction set (with 2/3 of my data) and a validation set (with 1/3 of my data).
> x <- 1:100 # test data
> # Build a label vector with exact counts (1/3 labelled 1, 2/3 labelled 2) and
> # shuffle it. Drawing every label independently with prob = c(1, 2) would only
> # give those proportions on average, not an exact 1/3 : 2/3 split.
> n_small <- round(length(x) / 3)
> labels <- rep(1:2, times = c(n_small, length(x) - n_small))
> # Shuffle by position: sample(labels) would misbehave for a length-one vector.
> y <- split(x, labels[sample.int(length(labels))])
> # y[["1"]] is the 1/3 part (validation set), y[["2"]] the 2/3 part (prediction set).
3) I would like to randomly divide this data frame in half. How do I select the rows that were not selected and assign them to randomsample2?
# Randomly flag half of the rows of `data`: selected = 1 for the chosen half,
# 0 for the rest. The row count is read from `data` itself rather than being
# hard-coded (the original note used 39622).
n_rows <- nrow(data)
data$selected <- rep(0, n_rows)
data$selected[sample.int(n_rows, n_rows %/% 2)] <- 1
rm(n_rows)
# The two halves: the selected rows, and the rows that were not selected.
randomsample1 <- data[data$selected == 1, , drop = FALSE]
randomsample2 <- data[data$selected == 0, , drop = FALSE]
or
# Alternative: flag each row independently with probability 0.5, so the two
# parts are only approximately equal in size. The number of draws is taken
# from `data` instead of the hard-coded 39622.
data$selected <- rbinom(nrow(data), 1, 0.5)
> # Simulated data: 100 rows, each assigned to one of 4 groups, ages uniform on [4, 80].
> x <- data.frame(group = sample(1:4, 100, replace = TRUE),
                  age = runif(100, min = 4, max = 80))
> # Per group: draw a random tenth of the ages (sample() truncates the
> # fractional size) and report the mean of that sub-sample.
> with(x, tapply(age, group, function(ages) {
    n_pick <- length(ages) / 10
    mean(ages[sample(seq_along(ages), n_pick)])
  }))
2) To split my dataset randomly into 2 parts: a prediction set (with 2/3 of my data) and a validation set (with 1/3 of my data).
> x <- 1:100 # test data
> # Build a label vector with exact counts (1/3 labelled 1, 2/3 labelled 2) and
> # shuffle it. Drawing every label independently with prob = c(1, 2) would only
> # give those proportions on average, not an exact 1/3 : 2/3 split.
> n_small <- round(length(x) / 3)
> labels <- rep(1:2, times = c(n_small, length(x) - n_small))
> # Shuffle by position: sample(labels) would misbehave for a length-one vector.
> y <- split(x, labels[sample.int(length(labels))])
> # y[["1"]] is the 1/3 part (validation set), y[["2"]] the 2/3 part (prediction set).
3) I would like to randomly divide this data frame in half. How do I select the rows that were not selected and assign them to randomsample2?
# Randomly flag half of the rows of `data`: selected = 1 for the chosen half,
# 0 for the rest. The row count is read from `data` itself rather than being
# hard-coded (the original note used 39622).
n_rows <- nrow(data)
data$selected <- rep(0, n_rows)
data$selected[sample.int(n_rows, n_rows %/% 2)] <- 1
rm(n_rows)
# The two halves: the selected rows, and the rows that were not selected.
randomsample1 <- data[data$selected == 1, , drop = FALSE]
randomsample2 <- data[data$selected == 0, , drop = FALSE]
or
# Alternative: flag each row independently with probability 0.5, so the two
# parts are only approximately equal in size. The number of draws is taken
# from `data` instead of the hard-coded 39622.
data$selected <- rbinom(nrow(data), 1, 0.5)
Extracting a random percentage of the data within each group
Motivating example:
If I have a dataframe with one of the variables called "age" for
example, and I want to extract a random 10% of the observations from
each "age" group of the entire data frame.
> # Reproducible example data: 200 rows with a 4-level age factor and a uniform random y.
> set.seed(23) # on Windows
> # replace = TRUE is spelled out: `rep = T` relied on partial argument
> # matching and on T, which can be reassigned.
> dat <- data.frame(age = factor(sample(1:4, 200, replace = TRUE)), y = runif(200))
> head(dat) # ages are in random order
age y
1 3 0.64275524
2 1 0.56125314
3 2 0.82418228
4 3 0.97050933
5 4 0.02827508
6 2 0.72291636
> table(age = dat$age) # how many in each age group
age
1 2 3 4
37 55 44 64
> # Split the row numbers of dat by age group, then draw (a rounded) 10% of the
> # row numbers within each group. seq_len() stays correct for a zero-row dat,
> # and sampling positions with sample.int() avoids sample()'s special case
> # for a length-one numeric vector.
> ind <- lapply(split(seq_len(nrow(dat)), dat$age),
                function(x) x[sample.int(length(x), round(length(x) / 10))]) # the trick
> ind
$`1`
[1] 135 2 188 133
$`2`
[1] 124 33 140 162 25 13
$`3`
[1] 115 79 27 44
$`4`
[1] 58 129 84 198 72 109
> # Pool the sampled row numbers of all groups, order them, and pull those rows.
> sample_dat <- dat[sort(unlist(ind, use.names = FALSE)), ] # with indices, select data
> sample_dat
age y
2 1 0.5612531
13 2 0.7339141
25 2 0.9548750
27 3 0.7419931
33 2 0.6965722
44 3 0.5363812
58 4 0.5464051
72 4 0.2785669
79 3 0.6453164
84 4 0.1203811
109 4 0.9154706
115 3 0.2118767
124 2 0.3056171
129 4 0.7635097
133 1 0.6474702
135 1 0.2466226
140 2 0.6292326
162 2 0.5338671
188 1 0.9882631
198 4 0.1983350
>
If I have a dataframe with one of the variables called "age" for
example, and I want to extract a random 10% of the observations from
each "age" group of the entire data frame.
> # Reproducible example data: 200 rows with a 4-level age factor and a uniform random y.
> set.seed(23) # on Windows
> # replace = TRUE is spelled out: `rep = T` relied on partial argument
> # matching and on T, which can be reassigned.
> dat <- data.frame(age = factor(sample(1:4, 200, replace = TRUE)), y = runif(200))
> head(dat) # ages are in random order
age y
1 3 0.64275524
2 1 0.56125314
3 2 0.82418228
4 3 0.97050933
5 4 0.02827508
6 2 0.72291636
> table(age = dat$age) # how many in each age group
age
1 2 3 4
37 55 44 64
> # Split the row numbers of dat by age group, then draw (a rounded) 10% of the
> # row numbers within each group. seq_len() stays correct for a zero-row dat,
> # and sampling positions with sample.int() avoids sample()'s special case
> # for a length-one numeric vector.
> ind <- lapply(split(seq_len(nrow(dat)), dat$age),
                function(x) x[sample.int(length(x), round(length(x) / 10))]) # the trick
> ind
$`1`
[1] 135 2 188 133
$`2`
[1] 124 33 140 162 25 13
$`3`
[1] 115 79 27 44
$`4`
[1] 58 129 84 198 72 109
> # Pool the sampled row numbers of all groups, order them, and pull those rows.
> sample_dat <- dat[sort(unlist(ind, use.names = FALSE)), ] # with indices, select data
> sample_dat
age y
2 1 0.5612531
13 2 0.7339141
25 2 0.9548750
27 3 0.7419931
33 2 0.6965722
44 3 0.5363812
58 4 0.5464051
72 4 0.2785669
79 3 0.6453164
84 4 0.1203811
109 4 0.9154706
115 3 0.2118767
124 2 0.3056171
129 4 0.7635097
133 1 0.6474702
135 1 0.2466226
140 2 0.6292326
162 2 0.5338671
188 1 0.9882631
198 4 0.1983350
>
Sunday, April 18, 2010
Extract rows from a data frame based on row names from another data frame
Found via Google searches; can be useful.
# Create data and data frames
x <- rnorm(5, 0, 1)
y <- rnorm(5, 0, 1)
z <- rnorm(5, 0, 1)
d1 <- data.frame(x, y)
d2 <- data.frame(y, z)
# Which variable names in d2 are also variable names in d1?
names(d2)[names(d2) %in% names(d1)] # it's y
# Give me the columns of d2 whose names are also variable names in d1.
# %in% is applied directly: comparing names(d2) with == against the vector of
# shared names recycles that vector and gives wrong results as soon as more
# than one name is shared.
d2[names(d2) %in% names(d1)]
# Check
d2$y
# Continuing with the example:
rownames(d1) <- letters[1:5]
rownames(d2) <- letters[3:7]
# And then for row names of d1 that are also row names of d2:
# for the full rows (drop = FALSE keeps a data frame even with one column) ...
d1[rownames(d1) %in% rownames(d2), , drop = FALSE]
# or for just the names:
rownames(d1)[rownames(d1) %in% rownames(d2)]
# Create data and data frames
x <- rnorm(5, 0, 1)
y <- rnorm(5, 0, 1)
z <- rnorm(5, 0, 1)
d1 <- data.frame(x, y)
d2 <- data.frame(y, z)
# Which variable names in d2 are also variable names in d1?
names(d2)[names(d2) %in% names(d1)] # it's y
# Give me the columns of d2 whose names are also variable names in d1.
# %in% is applied directly: comparing names(d2) with == against the vector of
# shared names recycles that vector and gives wrong results as soon as more
# than one name is shared.
d2[names(d2) %in% names(d1)]
# Check
d2$y
# Continuing with the example:
rownames(d1) <- letters[1:5]
rownames(d2) <- letters[3:7]
# And then for row names of d1 that are also row names of d2:
# for the full rows (drop = FALSE keeps a data frame even with one column) ...
d1[rownames(d1) %in% rownames(d2), , drop = FALSE]
# or for just the names:
rownames(d1)[rownames(d1) %in% rownames(d2)]
Subscribe to:
Posts (Atom)