58 lines
1.6 KiB
Python
Executable File
58 lines
1.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import argparse
|
|
import json
|
|
|
|
import eca.arff
|
|
|
|
def file_type(mode):
    """Build an argparse ``type=`` callable that opens text files as UTF-8.

    *mode* is the mode string passed on to ``open()`` (e.g. ``'r'`` or
    ``'w'``). The returned handler receives the command-line value and
    returns an open text stream:

    * ``'-'`` selects ``sys.stdin`` when *mode* contains ``'r'`` and
      ``sys.stdout`` when it contains ``'w'``. The standard stream is
      switched to UTF-8 when it supports ``reconfigure()`` so that it
      matches the encoding used for regular files.
    * any other value is opened with ``open(name, mode, encoding='utf-8')``.

    The handler raises ``argparse.ArgumentTypeError`` when *mode* fits
    neither standard stream or when the file cannot be opened; in the
    latter case the original ``OSError`` is attached as ``__cause__``.
    """
    def handler(name):
        if name == '-':
            import sys

            if 'r' in mode:
                stream = sys.stdin
            elif 'w' in mode:
                stream = sys.stdout
            else:
                raise argparse.ArgumentTypeError(
                    "can't use mode '{}' for stdin/stdout".format(mode))

            # The standard streams use the locale encoding by default, which
            # would break the "always UTF-8" contract. Replacement streams
            # (e.g. test doubles) may lack reconfigure(); leave those as-is.
            reconfigure = getattr(stream, 'reconfigure', None)
            if reconfigure is not None:
                reconfigure(encoding='utf-8')
            return stream

        try:
            return open(name, mode, encoding='utf-8')
        except OSError as e:
            # Chain the OSError so the underlying cause stays inspectable.
            raise argparse.ArgumentTypeError(
                "can't open '{}': {}".format(name, e)) from e

    return handler
|
|
|
|
def rows(tweets):
    """
    Generate one ARFF row per tweet.

    *tweets* is an iterable of lines (such as an opened data file), each
    holding one JSON object that represents a tweet. For every such line a
    dict ``{'tweet': <the object's 'text' value>}`` is yielded.

    Lines containing only whitespace are skipped: they are not JSON
    documents and would otherwise make ``json.loads`` raise, e.g. on a
    trailing empty line at the end of the file.

    Malformed JSON still raises ``json.JSONDecodeError`` and an object
    without a ``'text'`` key still raises ``KeyError``.
    """
    for line in tweets:
        if not line.strip():
            continue
        tweet = json.loads(line)
        yield {'tweet': tweet['text']}
|
|
|
|
def main():
    """
    Main program entry point.

    Parses the command line (``file``: tweet data source, ``output``:
    destination; both are opened by ``file_type``), describes the ARFF
    attributes and hands the generated rows to ``eca.arff.save``.

    Files opened for the arguments are closed once saving finishes or
    fails; ``sys.stdin`` and ``sys.stdout`` are left open.
    """
    import sys

    parser = argparse.ArgumentParser(description='Tweet data to ARFF converter')
    parser.add_argument('file', type=file_type('r'), help='Twitter data source')
    parser.add_argument('output', type=file_type('w'), help='Output file')
    args = parser.parse_args()

    try:
        # attribute description
        fields = [
            eca.arff.Field('tweet', eca.arff.Text()),
            eca.arff.Field('@@class@@', eca.arff.Nominal(['a', 'b', 'c'])),
        ]

        # create item generator
        items = rows(args.file)

        eca.arff.save(args.output, fields, items, name='ARFF for ' + args.file.name)
    finally:
        # Close what argparse opened so output is flushed deterministically,
        # but never close the process-wide standard streams.
        for stream in (args.file, args.output):
            if stream is not sys.stdin and stream is not sys.stdout:
                stream.close()
|
|
|
|
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|