Question: NEED HELP WITH CODED below this my code and the problem... and the dataset We will use a full day worth of tweets as an
NEED HELP WITH CODED below this my code and the problem... and the dataset
We will use a full day worth of tweets as an input (there are total of 4.4M tweets in this file, but you only need to read 1M): http://rasinsrv07.cstcis.cti.depaul.edu/CSC455/OneDayOfTweets.txt
Export the contents of the User table from a SQLite table into a sequence of INSERT statements within a file. This is very similar to what you did in Assignment 4. However, you have to add a unique ID column which has to be a string (you cannot use numbers). Hint: you can replace digits with letters, e.g., chr(ord('a')+1) gives you a 'b' and chr(ord('a')+2) returns a 'c'
import urllib, time, json, sqlite3
conn = sqlite3.connect('Tweets_Database.db') c = conn.cursor()
wFD = urllib.request.urlopen('http://rasinsrv07.cstcis.cti.depaul.edu/CSC455/Assignment4.txt') createTable = '''CREATE TABLE Tweets (
ID NUMBER(20),
Created_At DATE,
Text CHAR(140),
Source VARCHAR(200) DEFAULT NULL,
In_Reply_to_User_ID NUMBER(20),
In_Reply_to_Screen_Name VARCHAR(60),
In_Reply_to_Status_ID NUMBER(20),
Retweet_Count NUMBER(10),
Contributors VARCHAR(200),
CONSTRAINT Tweets_PK PRIMARY KEY (id)
);'''
Createuser ='''CREATE TABLE user (
id INT,
screen_name TEXT,
description TEXT,
friends_count INT,
contributors TEXT
CONSTRAINT USER_PK
PRIMARY KEY(ID)
);'''
c.execute('DROP TABLE IF EXISTS Tweets'); c.execute(createTable)
c.execute(createuser)
def loadTweets(tweetLines):
# Collect multiple rows so that we can use "executemany".
# We insert batchRows records at a time in this function.
batchRows = 10
batchedInserts = []
# as long as there is at least one line remaining
while len(tweetLines) > 0:
line = tweetLines.pop(0) # take off the first element from the list, removing it
tweetDict = json.loads(line)
newRowuser = []
newRow = [] # hold individual values of to-be-inserted row
tweetKeys = [ID,'id_str','created_at','text','source','in_reply_to_user_id', 'in_reply_to_screen_name', 'in_reply_to_status_id', 'retweet_count', 'contributors']
for key in tweetKeys:
# Treat '', [] and 'null' as NULL
if tweetDict[key] in ['',[],'null']:
newRow.append(None)
else:
newRow.append(tweetDict[key])
Userkey= [id,screen_name, description, friends_count, contributors]
# Add the new row to the collected batch
for key in Userkey:
# Treat '', [] and 'null' as NULL
if tweetDict[key] in ['',[],'null']:
newRowuser.append(None)
else:
newRowuser.append(tweetDict[Userkey])
batchedInserts.append(newRow)
batchedInserts_2.append(newRow)
# If we have reached # of batchRows, use executemany to insert what we collected
# so far, and reset the batchedInserts list back to empty
if len(batchedInserts) >= batchRows or len(tweetLines) == 0:
c.executemany('INSERT INTO Tweets VALUES(?,?,?,?,?,?,?,?,?)', batchedInserts) # Reset the batching process
batchedInserts = []
if len(batchedInserts_2) >= batchRows or len(tweetLines) == 0:
c.executemany('INSERT INTO user VALUES(?,?,?,?,?,?,?,?,?)', batchedInserts_2) # Reset the batching process
batchedInserts_2 = []
start = time.time()
tweets = wFD.read()
loadTweets( tweets.decode('utf8').split('EndOfTweet') ) end = time.time()
print ("loadTweets took ", (end-start), ' seconds.') print ("Loaded ", c.execute('SELECT COUNT(*) FROM Tweets').fetchall()[0], " rows") wFD.close()
c.close()
conn.commit()
conn.close()
Step by Step Solution
There are 3 Steps involved in it
Get step-by-step solutions from verified subject matter experts
