R语言学习笔记:SQL操作

简介:

虽然R很强大,但如果对SQL非常熟悉,也不能浪费这项技能了,可以用上sqldf包,从example("sqldf")抄了几条用法放在这里,以后可能会用上。

library("tcltk")

a1r <- head(warpbreaks)

a1s <- sqldf("select * from warpbreaks limit 6")

 

a2r <- subset(CO2, grepl("^Qn", Plant))

a2s <- sqldf("select * from CO2 where Plant like 'Qn%'")

  

data(farms, package = "MASS")

a3r <- subset(farms, Manag %in% c("BF", "HF"))

row.names(a3r) <- NULL

a3s <- sqldf("select * from farms where Manag in ('BF', 'HF')")

  

a4r <- subset(warpbreaks, breaks >= 20 & breaks <= 30)

a4s <- sqldf("select * from warpbreaks where breaks between 20 and 30",  row.names = TRUE)

 

a5r <- subset(farms, Mois == 'M1')

a5s <- sqldf("select * from farms where Mois = 'M1'", row.names = TRUE)

  

a6r <- subset(farms, Mois == 'M2')

a6s <- sqldf("select * from farms where Mois = 'M2'", row.names = TRUE)

 

a7r <- rbind(a5r, a6r)

a7s <- sqldf("select * from a5s union all select * from a6s")

row.names(a7r) <- NULL

  

其它例子暂时用不到,就不试了,把example(sqldf)的输出记录在这里。

sqldf> # aggregate - avg conc and uptake by Plant and Type
sqldf> a8r <- aggregate(iris[1:2], iris[5], mean)

sqldf> a8s <- sqldf('select Species, avg("Sepal.Length") `Sepal.Length`, 
sqldf+ avg("Sepal.Width") `Sepal.Width` from iris group by Species')

sqldf> all.equal(a8r, a8s)
[1] TRUE

sqldf> # by - avg conc and total uptake by Plant and Type
sqldf> a9r <- do.call(rbind, by(iris, iris[5], function(x) with(x,
sqldf+ data.frame(Species = Species[1], 
sqldf+ mean.Sepal.Length = mean(Sepal.Length),
sqldf+ mean.Sepal.Width = mean(Sepal.Width),
sqldf+ mean.Sepal.ratio = mean(Sepal.Length/Sepal.Width)))))

sqldf> row.names(a9r) <- NULL

sqldf> a9s <- sqldf('select Species, avg("Sepal.Length") `mean.Sepal.Length`,
sqldf+ avg("Sepal.Width") `mean.Sepal.Width`, 
sqldf+ avg("Sepal.Length"/"Sepal.Width") `mean.Sepal.ratio` from iris
sqldf+ group by Species')

sqldf> all.equal(a9r, a9s)
[1] TRUE

sqldf> # head - top 3 breaks
sqldf> a10r <- head(warpbreaks[order(warpbreaks$breaks, decreasing = TRUE), ], 3)

sqldf> a10s <- sqldf("select * from warpbreaks order by breaks desc limit 3")

sqldf> row.names(a10r) <- NULL

sqldf> identical(a10r, a10s)
[1] TRUE

sqldf> # head - bottom 3 breaks
sqldf> a11r <- head(warpbreaks[order(warpbreaks$breaks), ], 3)

sqldf> a11s <- sqldf("select * from warpbreaks order by breaks limit 3")

sqldf> # attributes(a11r) <- attributes(a11s) <- NULL
sqldf> row.names(a11r) <- NULL

sqldf> identical(a11r, a11s)
[1] TRUE

sqldf> # ave - rows for which v exceeds its group average where g is group
sqldf> DF <- data.frame(g = rep(1:2, each = 5), t = rep(1:5, 2), v = 1:10)

sqldf> a12r <- subset(DF, v > ave(v, g, FUN = mean))

sqldf> Gavg <- sqldf("select g, avg(v) as avg_v from DF group by g")

sqldf> a12s <- sqldf("select DF.g, t, v from DF, Gavg where DF.g = Gavg.g and v > avg_v")

sqldf> row.names(a12r) <- NULL

sqldf> identical(a12r, a12s)
[1] TRUE

sqldf> # same but reduce the two select statements to one using a subquery
sqldf> a13s <- sqldf("select g, t, v 
sqldf+ from DF d1, (select g as g2, avg(v) as avg_v from DF group by g) 
sqldf+ where d1.g = g2 and v > avg_v")

sqldf> identical(a12r, a13s)
[1] TRUE

sqldf> # same but shorten using natural join
sqldf> a14s <- sqldf("select g, t, v 
sqldf+ from DF 
sqldf+ natural join (select g, avg(v) as avg_v from DF group by g) 
sqldf+ where v > avg_v")

sqldf> identical(a12r, a14s)
[1] TRUE

sqldf> # table
sqldf> a15r <- table(warpbreaks$tension, warpbreaks$wool)

sqldf> a15s <- sqldf("select sum(wool = 'A'), sum(wool = 'B') 
sqldf+ from warpbreaks group by tension")

sqldf> all.equal(as.data.frame.matrix(a15r), a15s, check.attributes = FALSE)
[1] TRUE

sqldf> # reshape
sqldf> t.names <- paste("t", unique(as.character(DF$t)), sep = "_")

sqldf> a16r <- reshape(DF, direction = "wide", timevar = "t", idvar = "g", varying = list(t.names))

sqldf> a16s <- sqldf("select 
sqldf+ g, 
sqldf+ sum((t == 1) * v) t_1, 
sqldf+ sum((t == 2) * v) t_2, 
sqldf+ sum((t == 3) * v) t_3, 
sqldf+ sum((t == 4) * v) t_4, 
sqldf+ sum((t == 5) * v) t_5 
sqldf+ from DF group by g")

sqldf> all.equal(a16r, a16s, check.attributes = FALSE)
[1] TRUE

sqldf> # order
sqldf> a17r <- Formaldehyde[order(Formaldehyde$optden, decreasing = TRUE), ]

sqldf> a17s <- sqldf("select * from Formaldehyde order by optden desc")

sqldf> row.names(a17r) <- NULL

sqldf> identical(a17r, a17s)
[1] TRUE

sqldf> # centered moving average of length 7
sqldf> set.seed(1)

sqldf> DF <- data.frame(x = rnorm(15, 1:15))

sqldf> s18 <- sqldf("select a.x x, avg(b.x) movavgx from DF a, DF b 
sqldf+ where a.row_names - b.row_names between -3 and 3 
sqldf+ group by a.row_names having count(*) = 7 
sqldf+ order by a.row_names+0", 
sqldf+ row.names = TRUE)

sqldf> r18 <- data.frame(x = DF[4:12,], movavgx = rowMeans(embed(DF$x, 7)))

sqldf> row.names(r18) <- NULL

sqldf> all.equal(r18, s18)
[1] TRUE

sqldf> # merge. a19r and a19s are same except row order and row names
sqldf> A <- data.frame(a1 = c(1, 2, 1), a2 = c(2, 3, 3), a3 = c(3, 1, 2))

sqldf> B <- data.frame(b1 = 1:2, b2 = 2:1)

sqldf> a19s <- sqldf("select * from A, B")

sqldf> a19r <- merge(A, B)

sqldf> Sort <- function(DF) DF[do.call(order, DF),]

sqldf> all.equal(Sort(a19s), Sort(a19r), check.attributes = FALSE)
[1] TRUE

sqldf> # within Date, of the highest quality records list the one closest 
sqldf> # to noon. Note use of two sql statements in one call to sqldf.
sqldf> 
sqldf> Lines <- "DeployID Date.Time LocationQuality Latitude Longitude
sqldf+ STM05-1 2005/02/28 17:35 Good -35.562 177.158
sqldf+ STM05-1 2005/02/28 19:44 Good -35.487 177.129
sqldf+ STM05-1 2005/02/28 23:01 Unknown -35.399 177.064
sqldf+ STM05-1 2005/03/01 07:28 Unknown -34.978 177.268
sqldf+ STM05-1 2005/03/01 18:06 Poor -34.799 177.027
sqldf+ STM05-1 2005/03/01 18:47 Poor -34.85 177.059
sqldf+ STM05-2 2005/02/28 12:49 Good -35.928 177.328
sqldf+ STM05-2 2005/02/28 21:23 Poor -35.926 177.314
sqldf+ "

sqldf> DF <- read.table(textConnection(Lines), skip = 1, as.is = TRUE,
sqldf+ col.names = c("Id", "Date", "Time", "Quality", "Lat", "Long"))

sqldf> sqldf(c("create temp table DFo as select * from DF order by
sqldf+ Date DESC, Quality DESC,
sqldf+ abs(substr(Time, 1, 2) + substr(Time, 4, 2) /60 - 12) DESC",
sqldf+ "select * from DFo group by Date"))
Id Date Time Quality Lat Long
1 STM05-2 2005/02/28 12:49 Good -35.928 177.328
2 STM05-1 2005/03/01 18:47 Poor -34.850 177.059

sqldf> ## Not run: 
sqldf> ##D 
sqldf> ##D # test of file connections with sqldf
sqldf> ##D 
sqldf> ##D # create test .csv file of just 3 records
sqldf> ##D write.table(head(iris, 3), "iris3.dat", sep = ",", quote = FALSE)
sqldf> ##D 
sqldf> ##D # look at contents of iris3.dat
sqldf> ##D readLines("iris3.dat")
sqldf> ##D 
sqldf> ##D # set up file connection
sqldf> ##D iris3 <- file("iris3.dat")
sqldf> ##D sqldf('select * from iris3 where "Sepal.Width" > 3')
sqldf> ##D 
sqldf> ##D # using a non-default separator
sqldf> ##D # file.format can be an attribute of file object or an arg passed to sqldf
sqldf> ##D write.table(head(iris, 3), "iris3.dat", sep = ";", quote = FALSE)
sqldf> ##D iris3 <- file("iris3.dat")
sqldf> ##D sqldf('select * from iris3 where "Sepal.Width" > 3', file.format = list(sep = ";"))
sqldf> ##D 
sqldf> ##D # same but pass file.format through attribute of file object
sqldf> ##D attr(iris3, "file.format") <- list(sep = ";")
sqldf> ##D sqldf('select * from iris3 where "Sepal.Width" > 3')
sqldf> ##D 
sqldf> ##D # copy file straight to disk without going through R
sqldf> ##D # and then retrieve portion into R 
sqldf> ##D sqldf('select * from iris3 where "Sepal.Width" > 3', dbname = tempfile())
sqldf> ##D 
sqldf> ##D ### same as previous example except it allows multiple queries against
sqldf> ##D ### the database. We use iris3 from before. This time we use an
sqldf> ##D ### in memory SQLite database.
sqldf> ##D 
sqldf> ##D sqldf() # open a connection
sqldf> ##D sqldf('select * from iris3 where "Sepal.Width" > 3')
sqldf> ##D 
sqldf> ##D # At this point we have an iris3 variable in both
sqldf> ##D # the R workspace and in the SQLite database so we need to
sqldf> ##D # explicitly let it know we want the version in the database.
sqldf> ##D # If we were not to do that it would try to use the R version
sqldf> ##D # by default and fail since sqldf would prevent it from 
sqldf> ##D # overwriting the version already in the database to protect
sqldf> ##D # the user from inadvertent errors.
sqldf> ##D sqldf('select * from main.iris3 where "Sepal.Width" > 4')
sqldf> ##D sqldf('select * from main.iris3 where "Sepal_Width" < 4')
sqldf> ##D sqldf() # close connection
sqldf> ##D 
sqldf> ##D ### another way to do this is a mix of sqldf and RSQLite statements
sqldf> ##D ### In that case we need to fetch the connection for use with RSQLite
sqldf> ##D ### and do not have to specifically refer to main since RSQLite can
sqldf> ##D ### only access the database.
sqldf> ##D 
sqldf> ##D con <- sqldf()
sqldf> ##D # this iris3 refers to the R variable and file
sqldf> ##D sqldf('select * from iris3 where "Sepal.Width" > 3')
sqldf> ##D sqldf("select count(*) from iris3")
sqldf> ##D # these iris3 refer to the database table
sqldf> ##D dbGetQuery(con, 'select * from iris3 where "Sepal.Width" > 4')
sqldf> ##D dbGetQuery(con, 'select * from iris3 where "Sepal.Width" < 4')
sqldf> ##D sqldf()
sqldf> ##D 
sqldf> ## End(Not run)

本文转自申龙斌的程序人生博客园博文,原文链接:http://www.cnblogs.com/speeding/p/4090332.html,如需转载请自行联系原作者

http://www.cnblogs.com/speeding/ 

相关文章
|
2月前
|
SQL 关系型数据库 MySQL
在MySQL中,什么是结构化查询语言 (SQL)
【8月更文挑战第20天】在MySQL中,什么是结构化查询语言 (SQL)
40 1
|
3月前
|
SQL 存储 关系型数据库
SQL语言优缺点有哪些?
SQL(Structured Query Language)语言作为数据库管理和操作的标准语言,具有一系列的优点,同时也存在一些缺点。
54 7
|
2月前
|
SQL 关系型数据库 MySQL
|
2月前
|
SQL 存储 大数据
SQL 语言发展史简直太震撼啦!从诞生到现代数据处理,见证一场奇妙的演变之旅,快来感受!
【8月更文挑战第31天】SQL(结构化查询语言)自20世纪70年代由IBM研究员E.F. Codd提出以来,已成为现代数据处理不可或缺的一部分。它最初简化了层次和网状模型中复杂的存储与检索问题,通过基本的SELECT、FROM和WHERE关键字实现了数据查询。80年代,SQL在商业数据库中广泛应用,引入了GROUP BY、HAVING和ORDER BY等功能,增强了数据分析能力。90年代,互联网和企业信息化推动了SQL的进一步优化与扩展,支持分布式数据库和数据仓库等技术。
31 0
|
2月前
|
SQL 存储 NoSQL
从SQL到NoSQL:理解不同数据库类型的选择与应用——深入比较数据模型、扩展性、查询语言、一致性和适用场景,为数据存储提供全面决策指南
【8月更文挑战第31天】在信息技术飞速发展的今天,数据库的选择至关重要。传统的SQL数据库因其稳定的事务性和强大的查询能力被广泛应用,而NoSQL数据库则凭借其灵活性和水平扩展性受到关注。本文对比了两种数据库类型的特点,帮助开发者根据应用场景做出合理选择。SQL数据库遵循关系模型,适合处理结构化数据和复杂查询;NoSQL数据库支持多种数据模型,适用于非结构化或半结构化数据。SQL数据库在一致性方面表现优异,但扩展性较差;NoSQL数据库则设计之初便考虑了水平扩展性。SQL使用成熟的SQL语言,NoSQL的查询语言更为灵活。
42 0
|
2月前
|
SQL 数据可视化 数据挖掘
SQL 在数据分析中简直太牛啦!从数据提取到可视化,带你领略强大数据库语言的神奇魅力!
【8月更文挑战第31天】在数据驱动时代,SQL(Structured Query Language)作为强大的数据库查询语言,在数据分析中扮演着关键角色。它不仅能够高效准确地提取所需数据,还能通过丰富的函数和操作符对数据进行清洗与转换,确保其适用于进一步分析。借助 SQL 的聚合、分组及排序功能,用户可以从多角度深入分析数据,为企业决策提供有力支持。尽管 SQL 本身不支持数据可视化,但其查询结果可轻松导出至 Excel、Python、R 等工具中进行可视化处理,帮助用户更直观地理解数据。掌握 SQL 可显著提升数据分析效率,助力挖掘数据价值。
37 0
|
2月前
|
SQL 关系型数据库 数据库
|
2月前
|
SQL 关系型数据库 数据库
|
3月前
|
SQL 分布式计算 MaxCompute
SQL开发问题之对于ODPS中的UNION操作,执行计划的问题如何解决
SQL开发问题之对于ODPS中的UNION操作,执行计划的问题如何解决
|
2月前
|
SQL Java 数据库连接
下一篇
无影云桌面