################################################## ################################ # SECOND R CLASS # COVERS THE FOLLOWING TOPICS: # # (1) creating trasnformations of variables # (2) lists # (3) functions # (4) matrix operations # (5) qqplots # (6) general plots, multiple figures per plot # ################################################## ################################ ###################################################### # (1) Creating transformations of individual variables ###################################################### library(MASS) data(hills) # Ratios hills$ispeed<-hills$time/hills$dist # Logarithms hills$logtime<-log(hills$time) # Log several columns at once crabs dim(crabs) lcrabs<-crabs # Makes a copy lcrabs[,4:8]<-log(crabs[,4:8]) # The comma means "all rows": now all rows in columns 4, 5, 6, 7 and 8 are logged # Powers names(crabs) FL2<-crabs$FL^2 ###################################################### # (2) Lists ###################################################### # A list is a powerful object because it can hold objects of different types: matrices, data.frames, vectors, strings, etc letters # Create different types of objects string<-c("hello","this","is","a","string") matrix1<-matrix(1:20,nrow=4,ncol=5) matrix2<-matrix(1:10,nrow=5,ncol=2) vector1<-(1:20)^2 scalar1<-24 # Put them in a list mylist<-list(string,matrix1,matrix2,vector1,scalar1) dim(mylist) names(mylist) summary(mylist) # Access the different elements: indexing is with double brackets mylist[[1]] mylist[[2]] mylist[[3]] mylist[[4]] mylist[[5]] # We can give names to the elements mylist<-list(str=string,mat1=matrix1,mat2=matrix2,vec=vector1,sca=scalar1) names(mylist) summary(mylist) # Now we can access them with dollar sign mylist$str mylist$mat1 mylist$mat2 mylist$vec mylist$sca # And we can index each element separately mylist$mat1 mylist$mat[1:3,3] # This method does not work, unfortunately mylist[[2]][1:3,4:5] # This method does work ###################################################### # (3) Functions ###################################################### # Arguments to functions can be specified in two different ways: # 1. Arguments may be specified in the same order in which they are given in the # function definition. In this case, the values of the arguments must be supplied in order # 2. Arguments may be specified as name.of.argument=value. In this case, the order in which # the arguments are supplied is irrelevant. Also, the specification of names for the arguments # allows to give default values for the arguments # GENERAL FORM function.name<-function(arguments) { function.body return(output.arguments) } # Example 1: The simplest example # By default, a function return the last value calculated square<-function(x) x^2 square(10) # Add a default value # Function to square numbers square<-function(x=5) { x^2 } square() # Square and divide by 2 square.divide2<-function(x) { x^2 x/2 } square.divide2(3) # Example 2 std.dev<- function(x) sqrt(var(x)) # A function that calculates the std dev of x # Alternatively, it can be written in two lines: std.dev<- function(x) { sqrt(var(x)) } x<-rnorm(100,mean=13,sd=3) std.dev(x) # Example 3 # Default values are specified with equal signs in the definition of the arguments of the function # In the example below, if mu is not given as an argument its default value is zero t.test.p<- function(x,mu=0) { n<-length(x) t<-sqrt(n)*(mean(x)-mu)/std.dev(x) 2*(1-pt(abs(t),n-1)) # last value is returned } # Example 3 now with a list # Often we want many values returned, not only one: so we use a list to return as many objects as we want t.stat<- function(x,mu=0) { # This sets mu=0 by default, i.e. if we don't specify the value of mu, it uses mu=0 n<-length(x) t<-sqrt(n)*(mean(x)-mu)/std.dev(x) list(t.stat=t,p.value=2*(1-pt(abs(t),n-1))) # All values in list are returned } # When we give names to the elements of the list, we can access them with "$" # after we saved the output of the function in some object x<-rnorm(100,mean=0,sd=3) result<-t.stat(x) names(result) result$t.stat result$p.value ## But look what happens if after defining the list of objects we want we do some calculation t.stat<- function(x,mu=0) { # This sets mu=0 by default, i.e. if we don't specify the value of mu, it uses mu=0 n<-length(x) t<-sqrt(n)*(mean(x)-mu)/std.dev(x) list(t.stat=t,p.value=2*(1-pt(abs(t),n-1))) # All values in list are returned 2+2 } result<-t.stat(x) names(result) result ## The results are gone!! # To avoid this, use "return()" t.stat<- function(x,mu=0) { # This sets mu=0 by default, i.e. if we don't specify the value of mu, it uses mu=0 n<-length(x) t<-sqrt(n)*(mean(x)-mu)/std.dev(x) return(list(t.stat=t,p.value=2*(1-pt(abs(t),n-1)))) 2+2 } result<-t.stat(x) names(result) ###################################################### # (4) Matrix operations # A%*%B calculates matrix product # A%*%A calculates inner product A'A # A*B calculates the element-by-element product # crossprod(X) calculates X'X # ncol(A) ==> number of columns of A # nrow(A) ==> number of rows of A # To invert A, use can use solve(A), # Actually, solve(A,B) solves AX=B ==> X=A^(-1)B # So solve(A)solves AX=I ==> X=A^(-1) # or if A is not square use the generalized inverse ginv(A) # det(A) calculates the determinant of A # The tranpose of A is t(A) ###################################################### # Function to multiply two matrices multiply<-function(x,y) { x%*%y dim(x%*%y) } matrix1<-matrix(seq(1:20),nrow=5,ncol=4) matrix1 matrix2<-matrix(seq(1:8),nrow=4,ncol=2) matrix2 multiply(matrix1,matrix2) # But if we want to be able to access both outputs we do multiply<-function(x,y) { product<-x%*%y dimensions<-dim(x%*%y) list(product=product,dimensions=dimensions) } # Or, much more efficiently multiply<-function(x,y) { list(product=x%*%y,dimensions=dim(x%*%y)) } multiply(matrix1,matrix2) funout<-multiply(matrix1,matrix2) funout funout$product funout$dimensions A<-matrix(c(1,1,0,0,2,0,0,3,1),nrow=3,ncol=3) A # Function to multiply a matrix by its tranpose and invert it transpose<-function(x) { tx<-t(x) c1<-t(x)%*%x c2<-crossprod(x) inv1<-solve(x) inv2<-ginv(x) deter<-det(x) list(tran=tx,cross1=c1,cross2=c2,stdinver=inv1,geninver=inv2,det=deter) } output1<-transpose(A) A output1$tran output1$cross1 output1$cross2 output1$stdinver output1$geninver output1$det # Function to multiply a matrix by its tranpose and invert it inverse<-function(x) { inv<-solve(t(x)%*%x) list(inv=inv) } ###################################################### # (5) Graphs # We must open a graphics device that will receive graphical output # By deafault, the graph is sent to your computer screen # postscript(file="graph1.ps") # Starts the graphics device driver for producing PostScript graphics # Saves the graph in a the file "graph1.ps" in the directory currently being used # pdf(file="graph1.pdf") # Starts the graphics device driver for producing PDF graphics # Saves the graph in the file "graph1.pdf" in the directory currently being used # All open graphic devices must be closed using # dev.off() ###################################################### # Histograms x<-rnorm(1000) # Generates random vector y<-rnorm(1000) truehist(c(x,y+4),nbins=25) hist(x,breaks=10,xlab="Generated random numbers",main="This is not a very exciting histogram") # Makes histogram contour(dd<-kde2d(x,y)) # Generates level sets ==> 2D density plot image(dd) # Generates contour with colors # Plots # The default: one figure per graph x<--5:5 y<-x^2 plot(x,y) # But this can be easily changed : we can create multiple figures in one plot # We use "par()" to set graphical parameters par(mfrow=c(2,3)) # Selects a 2x3 array of figures, filled along rows par(mfcol=c(2,3)) # Selects a 2x3 array of figures, filled along columns ## "mfrow=c(2,3)" means MultiFrame Rowise (2x3) layout ## "mfrow=c(2,3)" means MultiFrame Columnwise (2x3) layout # Simple plots changing the paramater "type" par(mfrow=c(3,2)) plot(x,y) plot(x,y,type="l") plot(x,y,type="b") plot(x,y,type="h") plot(x,y,type="o") plot(x,y,type="n") mtext("Different options for the plot parameter type",side=3, outer=T,line=-3) par(mfrow=c(1,1)) # Reset # You can modify many things x<-seq(-5,5,1) y<-x^2 plot(x,y,pch="X",main="Title of graph",sub="Subtitle of the graph",xlab="Name of x", ylab="Name of y",xlim=c(-8,8),type="o",lty=2) ## Plot regression outputs x<-seq(1,100,0.5) # Makes a sequence from 1 to 1000 with 0.5 intervals y<-3*x+rnorm(length(x),mean=-2,sd=10) dum<-data.frame(x,y) rm(x,y) # Removes x,y, and w fm<-lm(y~x,data=dum) # Runs a linear regression summary(fm) # Shows the results attach(dum) # Standard scatter plot plot(x,y) abline(1,3,lty=3, col=2) # Adds true regression line lines(x,fitted(fm)) # Adds fitted regression line abline(fm, col=4) # Adds fitted regression line (another way of doing it) # QQ plot # A quantile versus quantile (Q-Q) plot is a graphical technique for assesing whether two data sets come from populations with a common distribution # A Q-Q plot is a plot of the quantiles of the first data set against the quantiles of the second data set # By a quantile, we mean the fraction (or percent) of points below the given value # A 45-degree reference line is usually also plotted. If the two sets come from a population with the same distribution, the points should fall # approximately along the 45-degree line. # The greater the departure from this reference line, the greater the evidence for the conclusion that the two data sets have come # from populations with different distributions. # qqnorm(x) plots the empirical quantiles of x against the theoretical quantiles of a standard normal # qqline(x) adds a line to a normal Q-Q plot which passes through the first and third quartiles # 'qqplot' produces a QQ plot of two datasets. # When the empirical quantiles are plotted on the y-axis and the theoretical quantiles are plotted on the x-axis # the empirical distribution has heavy tails if the outer parts of the curve are steeper than the middle part plot(fitted(fm),resid(fm),xlab="Fitted values", ylab="Residuals") qqnorm(resid(fm)) qqline(resid(fm)) qqnorm((resid(fm)-mean(resid(fm)))/sd(resid(fm))) abline(0,1,col=2) x<-rnorm(100) qqnorm(x) abline(0,1,col=2) qqline(x,col=4) detach() # Remove data frame from the search path rm(fm,dum) # Enter some data and do some graphs and calculations weight<-c(60,72,57,90,95,72) # Enter fake weight data weight height<-c(1.75, 1.80, 1.65, 1.90, 1.74, 1.91) # Enter fake height data height bmi<-weight/height^2 # Element by element bmi # Calculate mean and standard deviation of bmi bmi.mean<-sum(bmi)/length(bmi) bmi.mean mean(bmi) bmi.std<-sqrt(sum((bmi-bmi.mean)^2)/(length(bmi)-1)) bmi.std sqrt(var(bmi)) plot(height,weight) plot(height,weight,pch=2) # bmi=weigth/height^2 ==> weight=bmi*height^2 # Add a line wiht expected weight for every height that appears in the x-scale hh<-c(1.65, 1.70, 1.75, 1.80, 1.85, 1.90) lines(hh, bmi.mean*hh^2) # *22.5 # Note plot(height,weight) plot(x=height,y=weight) plot(y=weight,x=height) # If we specify the name of the argument, the order is irrelevant x<-rnorm(50) y<-x<-rnorm(50,2,1) plot(x,y,main="Plotting of normal random numbers",sub="Different means, standard deviation one", xlab="Random numbers mean=0 sd=1",ylab="Random numbers mean=2 sd=1") abline(v=0.6,h=0.5) # Two figures in two rows and one column par(mfrow=c(2,1)) # You can add an enter by writing "\n" plot(x,y,main="Plotting of normal random numbers \n Different means, standard deviation one", xlab="Random numbers mean=0 sd=1",ylab="Random numbers mean=2 sd=1") abline(v=0.6,h=0.5,col=2) x<-rnorm(50) hist(x,breaks=25) # Two figures in one row and two columns par(mfrow=c(1,2)) plot(x,y,main="Plotting of normal random numbers \n Different means, standard deviation one", xlab="Random numbers mean=0 sd=1",ylab="Random numbers mean=2 sd=1") abline(v=0.6,h=0.5,col=2) x<-rnorm(50) hist(x,breaks=25) ## One more example of how flexible it is # The sin function x<-seq(0,2*pi,length=21) y<-sin(x) plot(x,y,axes=F,type="b",pch="x",xlab="",ylab="") # Add the horizontal axis (by specifying pos=0 we say that the horizontal axis must be at y=0) axis(1,at=c(0,1,2,pi,4,5,2*pi),labels=c(0,1,2,"Pi",4,5,"2Pi"),pos=0) # Add the vertical axis axis(2,at=c(-1,-0.5,0,0.25,0.5,0.75,1),adj=1) # Add a horizontal lines abline(h=c(-1,-0.5,0.5,1),lty=3) # Add text (adj=0 means the specified (x,y) point is to the left of the text) text(pi,0.1," sin(pi)=0",adj=0) title("The sine function from 0 to 2 pi") plot(x,y,axes=F) box() axis(4)