# Modify the word count query so that the streaming query only returns results where the word count is greater than two.
# Question:
# Modify the word count query so that the streaming query only returns results where the word count is greater than two.
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
# Streaming word count: read text lines from a socket, split them into words,
# keep a running count per word, and print only the words whose count is
# greater than two to the console.

# Create (or reuse) the Spark session for this application.
# Parentheses allow the method chain to span several lines.
spark = (
    SparkSession
    .builder
    .appName("Assignment 7.1")
    .getOrCreate()
)

# Streaming DataFrame of text lines read from the socket at localhost:9999.
lines = (
    spark
    .readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9999)
    .load()
)

# Split the lines into words
words = lines.select(
    explode(
        split(lines.value, " ")
    ).alias("word")
)

# Generate running word count
wordCounts = words.groupBy("word").count()
# Only keep words seen more than twice. The column is looked up by key
# because `wordCounts.count` would resolve to the DataFrame.count method.
wordCounts = wordCounts.filter(wordCounts["count"] > 2)

# Defined up front so the handler can tell whether the query was started.
query = None
try:
    # "complete" mode re-emits the whole (filtered) result table each trigger.
    query = (
        wordCounts
        .writeStream
        .outputMode("complete")
        .format("console")
        .start()
    )
    # Block until the query terminates or the user interrupts (Ctrl+C).
    query.awaitTermination()
except KeyboardInterrupt:
    print('Stopping query')
    # Actually stop the streaming query instead of only announcing it.
    if query is not None:
        query.stop()