[SparkR]

简介: words
# Word count over an RDD of text lines.
# NOTE(review): `data` is not defined in this snippet; it is presumably an RDD
# of text lines created earlier (e.g. with textFile()) -- confirm.

# Split every line on single spaces and flatten the pieces into one RDD.
words <- flatMap(data, function(line) {
  strsplit(line, " ")[[1]]
})

# Pair each word with an initial integer count of 1.
wordCount <- lapply(words, function(word) {
  list(word, 1L)
})

# Combine the counts of identical words with "+". R is case-sensitive, so the
# call must be spelled `reduceByKey` (the original `reduceBykey` does not
# exist). The third argument, 2L, is presumably the number of partitions.
counts <- reduceByKey(wordCount, "+", 2L)

# Fetch the (word, count) pairs and print one per line. A dedicated loop
# variable is used so the `wordCount` RDD above is not overwritten.
output <- collect(counts)
for (pair in output) {
  cat(pair[[1]], ": ", pair[[2]], "\n")
}
#dataframe.R
library(SparkR)

# Shell step, to be run outside of R (it is not R code and is a parse error
# when left inline). NOTE(review): it removes the very file that is read
# below -- confirm the intended order before running it.
#   hadoop fs -rm /user/yyl/alldata.csv

# Request the spark-csv package. This has to happen before sparkR.init(),
# because the submit arguments are only consulted when the Spark backend is
# launched.
Sys.setenv(SPARKR_SUBMIT_ARGS = '"--packages" "com.databricks:spark-csv_2.10:1.0.3" "sparkr-shell"')

# Initialize SparkContext and SQLContext
sc <- sparkR.init(appName = "SparkR-DataFrame-example")
sqlContext <- sparkRSQL.init(sc)

# Create a DataFrame from a CSV file on HDFS. read.text() would yield a single
# `value` column, so the column-based query below could not be resolved; the
# spark-csv source with a header row gives named columns instead.
path <- "hdfs://hadoop-namenode1:8020/user/yyl/alldata.csv"
peopleDF <- read.df(
  sqlContext,
  path,
  source = "com.databricks.spark.csv",
  header = "true"
)
printSchema(peopleDF)

# Register this DataFrame as a table.
registerTempTable(peopleDF, "people")

# SQL statements can be run by using the sql methods provided by sqlContext.
# NOTE(review): assumes the CSV header provides `name` and `age` columns --
# confirm against the actual file.
teenagers <- sql(
  sqlContext,
  "SELECT name FROM people WHERE age >= 13 AND age <= 19"
)

# Call collect to get a local data.frame
teenagersLocalDF <- collect(teenagers)

# Print the teenagers in our dataset
print(teenagersLocalDF)
#data-manipulation.R
library(SparkR)

# Shell step, to be run outside of R (it is not R code and is a parse error
# when left inline):
#   hadoop dfs -copyFromLocal /opt/cloudera/parcels/spark-1.6.2-bin-cdh5/data/hoho /user/yyl

# Command line arguments supplied after the script name when this R session
# was invoked. The argument name is spelled out in full rather than relying
# on partial matching of `trailing`.
args <- commandArgs(trailingOnly = TRUE)

# Exactly one argument (the path to flights.csv) is expected; otherwise print
# the usage text and quit without saving the workspace.
if (length(args) != 1) {
  print("Usage: data-manipulation.R <path-to-flights.csv>")
  print("The data can be downloaded from: http://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv ")
  q("no")
}

## Initialize SparkContext
sc <- sparkR.init(appName = "SparkR-data-manipulation-example")

## Initialize SQLContext
sqlContext <- sparkRSQL.init(sc)

flightsCsvPath <- args[[1]]

# Create a local R data frame and parse its `date` column into Date values
flights_df <- read.csv(flightsCsvPath, header = TRUE)
flights_df$date <- as.Date(flights_df$date)

## Filter flights whose destination is San Francisco and write to a local
## data frame
SFO_df <- flights_df[flights_df$dest == "SFO", ]

# Convert the local data frame into a SparkR DataFrame
SFO_DF <- createDataFrame(sqlContext, SFO_df)

# Directly create a SparkR DataFrame from the source data
flightsDF <- read.df(
  sqlContext,
  flightsCsvPath,
  source = "com.databricks.spark.csv",
  header = "true"
)

# Print the schema of this Spark DataFrame
printSchema(flightsDF)

# Cache the DataFrame
cache(flightsDF)

# Print the first 6 rows of the DataFrame
showDF(flightsDF, numRows = 6) ## Or
head(flightsDF)

# Show the column names in the DataFrame
columns(flightsDF)

# Show the number of rows in the DataFrame
count(flightsDF)

# Select specific columns
destDF <- select(flightsDF, "dest", "cancelled")

# Using SQL to select columns of data
# First, register the flights DataFrame as a table
registerTempTable(flightsDF, "flightsTable")
destDF <- sql(sqlContext, "SELECT dest, cancelled FROM flightsTable")

# Use collect to create a local R data frame
local_df <- collect(destDF)

# Print the newly created local data frame
head(local_df)

# Filter flights whose destination is JFK
jfkDF <- filter(flightsDF, "dest = \"JFK\"") ## OR
jfkDF <- filter(flightsDF, flightsDF$dest == "JFK")

# If the magrittr library is available, we can use it to
# chain data frame operations. requireNamespace() answers that question
# directly instead of scanning the whole installed-package table.
if (requireNamespace("magrittr", quietly = TRUE)) {
  library(magrittr)

  # Group the flights by date and then find the average daily delay
  # Write the result into a DataFrame
  dailyDelayDF <- groupBy(flightsDF, flightsDF$date) %>%
    summarize(avg(flightsDF$dep_delay), avg(flightsDF$arr_delay))

  # Print the computed data frame
  head(dailyDelayDF)
}

# Stop the SparkContext now
sparkR.stop()
目录
相关文章
|
SQL 分布式计算 HIVE
SparkR
1. sparkR的简介 SparkR是一个R语言包,它提供了轻量级的方式使得可以在R语言中使用Apache Spark。
1562 0
[Papers]NSE, $u$, Lorentz space [Bosia-Pata-Robinson, JMFM, 2014]
$$\bex \bbu\in L^p(0,T;L^{q,\infty}),\quad \frac{2}{p}+\frac{3}{q}=1,\quad 3
785 0
[Papers]NSE, $u$, Lorentz space [Bjorland-Vasseur, JMFM, 2011]
$$\bex \int_0^T\frac{\sen{\bbu}_{L^{q,\infty}}^p}{\ve+\ln \sex{e+\sen{\bbu}_{L^\infty}}}\rd s
640 0
[Papers]NSE, $u$, Lorentz space [Sohr, JEE, 2001]
$$\bex \bbu\in L^{p,r}(0,T;L^{q,\infty}(\bbR^3)),\quad\frac{2}{p}+\frac{3}{q}=1,\quad 3
1040 0
|
Python
[Papers]NSE, $\pi$, Lorentz space [Suzuki, JMFM, 2012]
$$\bex \sen{\pi}_{L^{s,\infty}(0,T;L^{q,\infty}(\bbR^3))} \leq \ve_*, \eex$$ with $$\bex \frac{2}{s}+\frac{3}{q}=2,\quad \frac{5}{2}\leq q\leq 3. \eex$$
648 0
[Papers]NSE, $u_3$, Lebesgue space [NNP, QM, 2002; Zhou, JMPA, 2005]
$$\bex u_3\in L^p(0,T;L^q(\bbR^3)),\quad \frac{2}{p}+\frac{3}{q}=\frac{1}{2},\quad 6< q\leq \infty. \eex$$
733 0
[Papers]NSE, $\p_3u$, Lebesgue space [Kukavica-Ziane, JMP, 2007]
$$\bex \p_3\bbu\in L^p(0,T;L^q(\bbR^3)),\quad \frac{2}{p}+\frac{3}{q}=2,\quad \frac{9}{4}\leq q\leq 3. \eex$$
774 0
|
Python
[Papers]NSE, $\pi$, Lorentz space [Suzuki, NA, 2012]
$$\bex \sen{\pi}_{L^{s,\infty}(0,T;L^{q,\infty}(\bbR^3))} \leq \ve_*, \eex$$ with $$\bex \frac{2}{s}+\frac{3}{q}=2,\quad 3< q
572 0
[Papers]NSE, $u_3$, Lebesgue space [Zhou-Pokorny, Nonlinearity, 2009]
$$\bex u_3\in L^p(0,T;L^q(\bbR^3)),\quad \frac{2}{p}+\frac{3}{q}=\frac{3}{4}+\frac{1}{2q},\quad \frac{10}{3}
591 0
[Papers]NSE, $u_3$, Lebesgue space [Cao-Titi, IUMJ, 2008]
$$\bex u_3\in L^p(0,T;L^q(\bbR^3)),\quad \frac{2}{p}+\frac{3}{q}=\frac{2}{3}+\frac{2}{3q},\quad \frac{7}{2}
879 0