# User Tools

# Site Tools


# en:data_preparation_rscript
# Histograms of three distribution shapes - symmetrical, right-skewed and
# left-skewed - drawn side by side into a single PNG file.
# Every variable is passed through decostand (..., 'range') and multiplied by
# 100 before it is plotted. No random seed is set, so the sample (and the
# exact shape of the histograms) differs between runs.
png ('types_of_distribution.png', width = 8, height = 8/3, units = 'in', res = 300)
par (mfrow = c(1,3))

# decostand is exported from vegan, so it is accessed with `::`;
# `:::` is only needed for non-exported internals.
normal <- vegan::decostand (rnorm (1000), 'range')*100
hist (normal,
      main = list ('symmetrical', cex = 2),
      xlab = list ('Variable X', cex = 1.5),
      ylab = list ('Frequency', cex = 1.5),
      col = 'yellow')

# Cubing the rescaled values stretches the upper tail.
right <- vegan::decostand (normal^3, 'range')*100
hist (right,
      main = list ('right-skewed', cex = 2),
      xlab = list ('Variable X', cex = 1.5),
      ylab = list ('Frequency', cex = 1.5),
      col = 'red')

# log1p compresses the upper tail, leaving the longer tail on the left.
left <- vegan::decostand (log1p (normal), 'range')*100
hist (left,
      main = list ('left-skewed', cex = 2),
      xlab = list ('Variable X', cex = 1.5),
      ylab = list ('Frequency', cex = 1.5),
      col = 'lightblue')

dev.off ()
# Raw vs log-transformed population data
# Data from here: https://commons.wikimedia.org/wiki/File:Population_vs_area.svg
 
# Population data from Wikipedia.
# Values are paired by position with the `area` vector defined below; the two
# vectors are plotted against each other (pop ~ area) further down.
pop <- c(33681000, 1134000, 61811, 5110000, 6600, 3350400, 13388910, 52000, 
         79221000, 107000, 1185000, 40135000, 9863000, 19522000, 15757000, 
         23837000, 25721000, 506000, 2053355, 14027000, 3767000, 2985000, 
         6316000, 45828172, 3476608, 16517532, 6349000, 2719000, 2845000, 
         43739000, 163000, 57000, 1475000, 1500, 33000, 4312067, 23580000, 
         89300, 166649000, 3170000, 179000, 546200, 5743000, 4599000, 21075000, 
         1165040000, 8629900, 2067000, 109000, 39802000, 48333000, 6952000, 
         178000, 71517100, 28150000, 849000, 87000, 3291000, 523000, 172000, 
         30800, 2671000, 65073482, 9998000, 5413548, 9133000, 29165000, 6320000, 
         10000, 84000, 4821137, 15263000, 20000, 8935000, 111000, 513000, 
         11204000, 624000, 3683000, 6619000, 3230100, 10090000, 46143700, 
         791000, 104000, 5336330, 6420000, 230330000, 4422000, 306664000, 
         9276509, 88069000, 23000, 13010000, 800, 7602100, 1288000, 21496700, 
         18498000, 10631800, 48697000, 1400, 801600, 35593, 1409000, 28200000, 
         12534000, 22894000, 32710000, 127580000, 15290000, 191293104, 3000, 
         48841, 10069000, 3454000, 762000, 4579000, 491700, 67000, 86000, 31000, 
         21906000, 20000, 154729000, 13995000, 162221000, 400000, 21809733, 
         70495782, 34895000, 6163000, 10000, 50, 10474600, 62000, 16922000, 
         3982000, 10741000, 63389730, 10033000, 322100, 7008900, 5696000, 
         4382100, 5515287, 92226600, 3572700, 31491578, 2171000, 1611000, 
         98000, 7719100, 104000, 80000, 11206000, 1340341, 3361000, 676000, 
         4224000, 27488000, 10327800, 864000, 65000, 88000, 67000, 44952732, 
         8303000, 33000, 256000, 19625000, 60090400, 697000, 42272000, 29331000, 
         412600, 309000, 520000, 56000, 15000, 28359313, 198000, 7411500, 
         3761646, 12935000, 8356707, 6732000, 12523000, 82062200, 240000, 
         15571506, 38130300, 5073000, 5482000, 30747000, 5900, 23906000, 
         1339000, 319326, 2257300, 10029900, 9671900, 7466000, 109610000, 
         76762112, 2048900, 4839400, 9850000, 676000, 61612300, 11262500, 
         20238000, 4432000, 1950000, 1331115200, 141812991)
 
# Area data from Wikipedia.
# Values are paired by position with `pop`; the plots further down label this
# axis as Area (km^2). One entry is 0, which cannot be displayed on a
# logarithmic axis.
area <- c(9984670, 14874, 78, 488100, 122, 65300, 181035, 261, 1104300, 180, 17364, 
          2780400, 1098581, 475442, 274222, 238533, 2149690, 4033, 20273, 108889, 
          51197, 17818, 89342, 505992, 111369, 41543, 406752, 10991, 309500, 
          945087, 964, 2166086, 267668, 260, 1, 270467, 527968, 116, 881912, 
          28748, 2831, 29, 120340, 83600, 322463, 3287263, 86600, 30355, 389, 
          580367, 99678, 143100, 549, 783562, 652090, 18274, 464, 1025520, 28896, 
          539, 61, 1564100, 632760, 26338, 49035, 637657, 1285216, 236800, 21, 455, 
          323802, 118484, 236, 112622, 702, 266000, 109886, 13812, 342000, 56785, 
          29743, 48310, 603500, 741, 747, 338145, 1759540, 1904569, 622984, 
          9629091, 450295, 331689, 151, 1240192, 0, 110879, 2040, 238391, 
          1246700, 92090, 1221037, 12, 9251, 160, 11586, 329847, 196722, 801590, 
          241038, 377915, 1267000, 8514877, 12173, 1393, 245857, 75517, 214969, 
          51100, 2586, 199, 468, 6, 185180, 459, 923768, 283561, 143998, 5765, 
          7692024, 1648195, 2381741, 21041, 26, 5, 78867, 181, 756102, 8870, 30528, 
          513120, 27750, 22966, 1108, 71740, 69700, 43094, 300000, 33851, 446550, 
          824292, 36125, 726, 41284, 344, 572, 1284000, 45228, 176215, 28051, 10400, 
          447400, 163610, 23200, 54, 442, 751, 1138914, 27834, 948, 430, 587041, 
          301318, 38394, 2505813, 147181, 316, 298, 163820, 264, 91, 912050, 800, 
          22072, 6020, 752618, 83871, 462840, 390757, 357022, 12189, 2724900, 
          312685, 117600, 199951, 438317, 102, 120538, 5130, 103000, 64589, 93028, 
          208000, 112492, 1964375, 1002000, 25713, 699, 88361, 2235, 242900, 
          131957, 65610, 56594, 582000, 9639688, 17098242)
 
# Two-panel figure: population against area on raw axes (left) and on
# log-log axes (right).
png ('raw-vs-log-population-area.png', width = 8, height = 4, units = 'in', res = 300, pointsize = 10)
par (mfrow = c(1,2))

plot (pop ~ area,
      main = list ('Raw data', cex = 2),
      xlab = list (expression (Area~(km^2)), cex = 1.5),
      ylab = list ('Population', cex = 1.5),
      pch = 16)

# A logarithmic axis cannot display values <= 0 (area contains a 0). plot ()
# would drop such points itself, but with a warning; excluding them here
# gives the same figure and makes the omission explicit.
positive <- pop > 0 & area > 0
plot (pop[positive] ~ area[positive],
      main = list ('Log-transformed data', cex = 2),
      xlab = list (expression (Area~(km^2)), cex = 1.5),
      ylab = list ('Population', cex = 1.5),
      log = 'xy',
      pch = 16)

dev.off ()
# Digitize the pig outline (interactive, one-off step; kept commented out for reference)
# library (pixmap)
# p <- read.pnm ('c:\\Users\\Zeleny\\Dropbox\\uceni\\NumEcol\\figures\\pig for transformation.ppm')
# windows ()
# plot.new ()
# plot.window (xlim = c(0, 100), ylim = c(0, 67))
# axis (1)
# axis (2)
# addlogo (p, c(0,100), c(0, 67))
# pig <- locator ()
# pig.df <- as.data.frame (pig)
# plot (pig.df)
# plot (pig.df, type = 'l', lwd = 10, log = 'xy', ann = T)
# #save (pig.df, file = 'c:\\Users\\Zeleny\\Dropbox\\uceni\\NumEcol\\figures\\pix.df.r')
 
# load (file = 'c:\\Users\\Zeleny\\Dropbox\\uceni\\NumEcol\\figures\\pix.df.r')
# pig.df <- pig.df/20
# write.table (pig.df, 'normal.pig.txt', sep = '\t', row.names = F)
 
# Read the pig outline coordinates; the columns x and y are used below.
pig.df <- read.delim ('https://raw.githubusercontent.com/zdealveindy/anadat-r/master/data/normal.pig.txt')

# All 49 combinations of an x-column name (a) and a y-column name (b); the
# first column varies fastest. Each row names the pair of pig.df columns that
# is drawn in one panel of the transformation figure.
exp.pics <- as.matrix (expand.grid (a = c('log (x)', 'x^(1/3)', 'sqrt (x)', 'x', 'x^2', 'x^3', 'e^x'), b = c('log (y)', 'y^(1/3)', 'sqrt (y)', 'y', 'y^2', 'y^3', 'e^y')))

# Add one column per transformation of x and of y. The list names are
# sprintf templates for the resulting column names, so that they match the
# names used in exp.pics exactly (e.g. 'log (x)', 'y^(1/3)', 'e^x').
# Columns are only ever accessed by name, so their order does not matter.
transformations <- list (
  'log (%s)'  = log,
  '%s^(1/3)'  = function (v) v^(1/3),
  'sqrt (%s)' = sqrt,
  '%s^2'      = function (v) v^2,
  '%s^3'      = function (v) v^3,
  'e^%s'      = exp
)
for (coord in c('x', 'y'))
{
  for (template in names (transformations))
  {
    pig.df[[sprintf (template, coord)]] <- transformations[[template]] (pig.df[[coord]])
  }
}

# Export the pig with exponentiated x and untransformed y, under the plain
# column names x and y.
right_skewed_pig <- pig.df[, c('e^x', 'y')]
names (right_skewed_pig) <- c('x', 'y')
write.table (right_skewed_pig, 'right-skewed.pig.txt', row.names = FALSE, sep = '\t')
 
# Figure with a 7 x 7 grid of pigs, one per combination of x- and
# y-transformation listed in exp.pics, plus a row and a column of labels.
#
# Layout matrix (8 x 8):
#   cells 1-49  - the pig panels, filled row by row,
#   cells 50-56 - labels in the top row (above the panel columns),
#   cells 57-63 - labels in the left column (beside the panel rows),
#   cell 0      - the unused top-left corner.
mat <- matrix (0, ncol = 8, nrow = 8)
mat [2:8, 2:8] <- matrix (1:49, ncol = 7, byrow = TRUE)
mat [1, 2:8] <- 50:56
mat [2:8, 1] <- 57:63

png ('trans.pig.png', width = 8, height = 5.44, res = 300, units = 'in', pointsize = 4)
par (mar = c(2,2,2,2))
# Label row/column take half the width/height of a panel row/column.
layout (mat, widths = c(2, rep (4, 7)), heights = c(2, rep (4, 7)))
#apply (as.matrix (exp.pics), 1, FUN = function (pic) plot (x = pig.df[,pic[1]], y = pig.df[,pic[2]], axes = F, ann = F, type = 'l', lwd = 5))
for (i in seq_len (nrow (exp.pics)))
{
  plot (x = pig.df[,exp.pics[i, 1]], y = pig.df[,exp.pics[i, 2]], axes = FALSE, ann = FALSE, type = 'l', lwd = 2)
  # Panel 25 is the centre of the grid (untransformed x against y); it is the
  # only one drawn with a box and axis annotation.
  if (i == 25)
  {
    box ()
    axis (1, cex.axis = 2, tick = FALSE)
    axis (2, cex.axis = 2, las = 1, tick = FALSE)
  }
}

# Draws one label cell: an empty plot with the label, parsed as a plotmath
# expression, in its centre ('sqrt (x, 3)' renders as a cube root).
draw_label <- function (label)
{
  plot.new ()
  plot.window (xlim = c(0,1), ylim = c(0,1))
  text (0.5, 0.5, labels = parse (text = label), cex = 4)
}

# The label vectors are reversed relative to the order of the panels, so each
# column/row is labelled with the inverse of the transformation applied to the
# panels in it (e.g. the 'log (x)' panels sit under the label e^x).
# NOTE(review): this pairing looks intentional - confirm.
# Top row (cells 50-56): x labels.
for (label in rev (c('log (x)', 'sqrt (x, 3)', 'sqrt (x)', 'x', 'x^2', 'x^3', 'e^x')))
{
  draw_label (label)
}
# Left column (cells 57-63): y labels.
for (label in rev (c('log (y)', 'sqrt (y, 3)', 'sqrt (y)', 'y', 'y^2', 'y^3', 'e^y')))
{
  draw_label (label)
}
dev.off ()
# en/data_preparation_rscript.txt · Last modified: 2019/02/10 20:26 by David Zelený