Changed tweets, added eca
This commit is contained in:
384
eca/arff.py
Normal file
384
eca/arff.py
Normal file
@@ -0,0 +1,384 @@
|
||||
"""
|
||||
ARFF format loading and saving module.
|
||||
|
||||
This module implements the book version [1] of the ARFF format. This means there
|
||||
is no support for instance weights.
|
||||
|
||||
Known limitations:
|
||||
- This implementation does not parse dates
|
||||
|
||||
[1]: http://weka.wikispaces.com/ARFF+%28book+version%29
|
||||
"""
|
||||
|
||||
import re
|
||||
from collections import namedtuple
|
||||
|
||||
# One ARFF attribute: its name paired with the field type object that
# parses and formats its values.
Field = namedtuple('Field', 'name type')
|
||||
|
||||
# Public names exported by a star-import of this module.
__all__ = ['load', 'save', 'Field', 'Numeric', 'Text', 'Nominal']
|
||||
|
||||
|
||||
#
|
||||
# Line type functions
|
||||
#
|
||||
|
||||
def is_empty(line):
    """Tell whether the line holds nothing but whitespace."""
    remainder = line.strip()
    return len(remainder) == 0
|
||||
|
||||
def is_comment(line):
    """Tell whether the line is a comment, i.e. its first character is '%'."""
    first_char = line[:1]
    return first_char == '%'
|
||||
|
||||
def format_comment(line):
    """Turn the given text into a comment line by prefixing it with '% '."""
    return ''.join(('% ', line))
|
||||
|
||||
def is_relation(line):
    """Tell whether the line opens with '@relation' (case-insensitive)."""
    keyword = '@relation'
    return line.lower()[:len(keyword)] == keyword
|
||||
|
||||
def format_relation(name):
    """Build the '@relation' header line (newline included) for the given name."""
    return '@relation {}\n'.format(format_identifier(name))
|
||||
|
||||
def is_attribute(line):
    """Tell whether the line opens with '@attribute' (case-insensitive)."""
    keyword = '@attribute'
    return line.lower()[:len(keyword)] == keyword
|
||||
|
||||
def format_attribute(field):
    """
    Build the '@attribute' header line (newline included) for one field.

    The field's name is written as an identifier and its type is written
    using the type object's ``str`` form.
    """
    pieces = ('@attribute', format_identifier(field.name), str(field.type))
    return ' '.join(pieces) + '\n'
|
||||
|
||||
def format_attributes(fields):
    """Build the concatenated '@attribute' lines for all given fields, in order."""
    return ''.join(format_attribute(field) for field in fields)
|
||||
|
||||
def is_data(line):
    """Tell whether the line opens with '@data' (case-insensitive)."""
    keyword = '@data'
    return line.lower()[:len(keyword)] == keyword
|
||||
|
||||
def format_data():
    """Build the line that marks the start of the data section."""
    marker = '@data'
    return marker + '\n'
|
||||
|
||||
def format_row(row, fields, sparse=False):
    """
    Format one data row as a line of text (newline included).

    row -- a mapping; each value is looked up with ``row.get(field.name)``
    fields -- the fields describing the columns, in output order
    sparse -- when true, emit the ``{index value,...}`` form that lists only
              the values differing from their field type's default
    """
    if not sparse:
        cells = [field.type.format(row.get(field.name)) for field in fields]
        return ','.join(cells) + '\n'

    entries = []
    for position, field in enumerate(fields):
        value = row.get(field.name)
        # Values equal to the type's default are left out of a sparse row.
        if value != field.type.default():
            entries.append(format_numeric(position) + ' ' + field.type.format(value))
    return '{' + ','.join(entries) + '}\n'
|
||||
|
||||
|
||||
def safe_next(it):
    """Fetch the next item from the iterator, or '' once it is exhausted."""
    return next(it, '')
|
||||
|
||||
|
||||
def whitespace(rest):
    """Consume leading whitespace and return the remaining input."""
    remaining = rest.lstrip()
    return remaining
|
||||
|
||||
|
||||
# Optional sign, then digits with an optional fraction (or a bare fraction),
# then an optional exponent.
number_pattern = re.compile(r'[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?')


def numeric(rest):
    """
    Parse a number at the start of the input.

    Returns a ``(number, remaining_input)`` pair. The number is an ``int``
    when the matched text is a valid integer literal and a ``float``
    otherwise. Raises ``ValueError`` when the input does not start with a
    number.
    """
    match = number_pattern.match(rest)
    if match is None:
        raise ValueError('Number not parsable')

    text = match.group(0)
    try:
        value = int(text)
    except ValueError:
        # Not an integer literal (fraction or exponent present).
        value = float(text)
    return value, rest[len(text):]
|
||||
|
||||
def format_numeric(number):
    """Render a number as text using its ``str`` form."""
    text = str(number)
    return text
|
||||
|
||||
def expect(rest, string):
    """
    Check for the given string at the start of the input.

    Returns ``(True, input_after_the_string)`` when it is present and
    ``(False, input_unchanged)`` when it is not.
    """
    if not rest.startswith(string):
        return False, rest
    return True, rest[len(string):]
|
||||
|
||||
|
||||
# Maps the character that follows a backslash inside a quoted identifier to
# the character it stands for.
identifier_escapes = {
    '\\': '\\',
    'n': '\n',
    't': '\t',
    'r': '\r',
    '%': '%',
    "'": "'",
}


def identifier(rest):
    """
    Parse an optionally quoted identifier at the start of the input.

    An unquoted identifier runs up to (not including) the first space, tab
    or comma. A quoted identifier is wrapped in single quotes and may use
    the backslash escapes listed in `identifier_escapes`; an unknown escape
    is kept verbatim, backslash included.

    Returns a ``(name, remaining_input)`` pair. Raises ``ValueError`` when
    the input ends directly after a backslash inside a quoted identifier.
    """
    chars = iter(rest)
    current = next(chars, '')
    pieces = []

    if current != "'":
        # Unquoted: collect characters until a separator or the input end.
        while current and current not in (' ', '\t', ','):
            pieces.append(current)
            current = next(chars, '')
        # The separator that stopped the scan stays in the remaining input.
        return ''.join(pieces), current + ''.join(chars)

    # Quoted: step past the opening quote, then read up to the closing one.
    current = next(chars, '')
    while current and current != "'":
        if current == '\\':
            escaped = next(chars, '')
            if not escaped:
                raise ValueError('Input end during escape.')
            pieces.append(identifier_escapes.get(escaped, '\\' + escaped))
        else:
            pieces.append(current)
        current = next(chars, '')
    return ''.join(pieces), ''.join(chars)


def format_identifier(name):
    """
    Format an identifier so that `identifier` reads back the same text.

    The name is wrapped in single quotes, with the characters from
    `identifier_escapes` backslash-escaped, when it:

    - contains a space, a comma, a brace or a character that needs escaping;
    - is empty, because an unquoted empty name vanishes from the line;
    - starts with '?', because the field parsers read a leading '?' as a
      missing value.

    Any other name is returned unchanged.
    """
    # character -> the letter that follows the backslash in its escape
    reverse_escapes = {char: code for code, char in identifier_escapes.items()}

    needs_quotes = (
        name == ''
        or name.startswith('?')
        or any(ch in reverse_escapes or ch in ' ,{}' for ch in name)
    )
    if not needs_quotes:
        return name

    escaped = ''.join(
        '\\' + reverse_escapes[ch] if ch in reverse_escapes else ch
        for ch in name
    )
    return "'" + escaped + "'"
|
||||
|
||||
class Numeric:
    """Numeric field type."""

    def parse(self, rest):
        """
        Parse a numeric value, or '?' for a missing value, at the input start.

        Returns a ``(value, remaining_input)`` pair; the value is ``None``
        for a missing value.
        """
        if rest[:1] == '?':
            return None, rest[1:]
        return numeric(rest)

    def format(self, number):
        """Render the number as text; ``None`` is written as '?'."""
        return '?' if number is None else format_numeric(number)

    def default(self):
        """Return the value assumed for entries omitted from a sparse row."""
        return 0

    def __repr__(self):
        return 'Numeric'

    def __str__(self):
        # The type name as written in an '@attribute' line.
        return 'numeric'
|
||||
|
||||
|
||||
class Text:
    """Text field type."""

    def parse(self, rest):
        """
        Parse a text value, or '?' for a missing value, at the input start.

        Returns a ``(value, remaining_input)`` pair; the value is ``None``
        for a missing value.
        """
        if rest[:1] == '?':
            return None, rest[1:]
        return identifier(rest)

    def format(self, name):
        """Render the text as an identifier; ``None`` is written as '?'."""
        return '?' if name is None else format_identifier(name)

    def default(self):
        """Return the value assumed for entries omitted from a sparse row."""
        return ''

    def __repr__(self):
        return 'Text'

    def __str__(self):
        # The type name as written in an '@attribute' line.
        return 'string'
|
||||
|
||||
|
||||
class Nominal:
    """Nominal field type: a value drawn from a fixed list of names."""

    def __init__(self, names):
        """Store the allowed names; the first one serves as the default."""
        self.values = names

    def _unknown(self, name):
        """Build the error raised for a name outside the allowed values."""
        message = 'Unknown nominal constant "{}" for {}.'.format(name, self.values)
        return ValueError(message)

    def parse(self, rest):
        """
        Parse one of the allowed names, or '?' for a missing value.

        Returns a ``(value, remaining_input)`` pair; the value is ``None``
        for a missing value. Raises ``ValueError`` for a name that is not
        among the allowed values.
        """
        if rest[:1] == '?':
            return None, rest[1:]

        name, remaining = identifier(rest)
        if name not in self.values:
            raise self._unknown(name)
        return name, remaining

    def format(self, name):
        """
        Render the name as an identifier; ``None`` is written as '?'.

        Raises ``ValueError`` for a name that is not among the allowed values.
        """
        if name is None:
            return '?'
        if name not in self.values:
            raise self._unknown(name)
        return format_identifier(name)

    def default(self):
        """Return the value assumed for entries omitted from a sparse row."""
        return self.values[0]

    def __repr__(self):
        return 'Nominal in {}'.format(self.values)

    def __str__(self):
        # The brace-enclosed value list as written in an '@attribute' line.
        listed = ', '.join(format_identifier(name) for name in self.values)
        return '{' + listed + '}'
|
||||
|
||||
|
||||
def attr_type(rest):
    """
    Parse a field type from the whole of the given input.

    Recognised (case-insensitively) are 'numeric', 'integer' and 'real' for
    a `Numeric` type and 'string' for a `Text` type. A brace-enclosed,
    comma-separated list of names gives a `Nominal` type.

    Raises ``NotImplementedError`` for date types and ``ValueError`` for
    anything else.
    """
    lowered = rest.lower()
    if lowered in ('numeric', 'integer', 'real'):
        return Numeric()
    if lowered == 'string':
        return Text()
    if lowered.startswith('date'):
        raise NotImplementedError('date parsing is not implemented.')
    if not (rest.startswith('{') and rest.endswith('}')):
        raise ValueError('Unknown attribute type "{}"'.format(rest))

    # Nominal: read names separated by commas from between the braces.
    names = []
    remaining = rest[1:-1]
    while remaining:
        name, remaining = identifier(whitespace(remaining))
        names.append(name)
        seen, remaining = expect(whitespace(remaining), ',')
        if not seen:
            break
    return Nominal(names)
|
||||
|
||||
|
||||
def parse_attribute(line):
    """
    Parse an '@attribute' line into a ``(name, field_type)`` pair.

    The expected layout is: @attribute WS name WS type
    """
    # Drop the keyword itself; what follows is the name and then the type.
    remainder = line[len('@attribute'):].strip()
    remainder = whitespace(remainder)
    name, remainder = identifier(remainder)
    field_type = attr_type(whitespace(remainder))
    return name, field_type
|
||||
|
||||
|
||||
def parse_row(line, fields):
    """
    Parse one data line into a dictionary keyed by field name.

    A line starting with '{' is read as a sparse row: comma-separated
    ``index value`` pairs between braces, where every field that is not
    listed gets its type's default value. Any other line is read as a dense
    row holding one value per field, in field order.

    line -- the text of the row
    fields -- the list of fields describing the columns

    Raises ``ValueError`` when a sparse row lacks its closing brace, or when
    an attribute index is not an integer, is out of range, or is repeated.
    Errors raised by the field type parsers pass through unchanged.
    """
    line = line.strip()
    values = {}

    if not line.startswith('{'):
        rest = line
        for position, field in enumerate(fields):
            if position:
                rest = whitespace(rest)
                # The separator is consumed when present; its absence is
                # tolerated, as it always has been.
                _, rest = expect(rest, ',')
            rest = whitespace(rest)
            value, rest = field.type.parse(rest)
            values[field.name] = value
        return values

    # Without this check the slice below would silently eat the last data
    # character of a line that is missing its closing brace.
    if not line.endswith('}'):
        raise ValueError('Sparse row is missing its closing brace.')

    todo = set(range(len(fields)))
    rest = line[1:-1].strip()
    first = True
    while rest:
        if not first:
            rest = whitespace(rest)
            seen, rest = expect(rest, ',')
            if not seen:
                break
        first = False

        rest = whitespace(rest)
        index, rest = numeric(rest)
        # numeric() may hand back a float or a negative number; neither is
        # a usable position in the field list.
        if not isinstance(index, int) or not 0 <= index < len(fields):
            raise ValueError('Invalid attribute index {} in sparse row.'.format(index))
        if index not in todo:
            raise ValueError('Duplicate attribute index {} in sparse row.'.format(index))

        field = fields[index]
        rest = whitespace(rest)
        value, rest = field.type.parse(rest)
        todo.remove(index)
        values[field.name] = value

    # Every field left unmentioned takes its type's default value.
    for field in (fields[i] for i in todo):
        values[field.name] = field.type.default()
    return values
|
||||
|
||||
|
||||
def load(fileish):
    """
    Loads a data set from an arff formatted file-like object.

    This is a generator function. It first reads the header to collect the
    attribute definitions, then yields one dictionary per data line, keyed
    by attribute name. Sparse rows are expanded to full rows.

    fileish -- a file-like object, iterated line by line
    """
    source = iter(fileish)
    fields = []

    # Header: runs up to and including the '@data' line.
    for header_line in source:
        if is_empty(header_line) or is_comment(header_line):
            continue
        if is_relation(header_line):
            # The relation name is not used.
            continue
        if is_attribute(header_line):
            fields.append(Field(*parse_attribute(header_line)))
        elif is_data(header_line):
            break

    # Data: one row per remaining non-empty, non-comment line.
    for data_line in source:
        if not (is_empty(data_line) or is_comment(data_line)):
            yield parse_row(data_line, fields)
|
||||
|
||||
def save(fileish, fields, rows, name='unnamed relation', sparse=False):
    """
    Saves an arff formatted data set to a file-like object.

    The header (relation line, attribute lines, data marker) is written
    first, followed by one line per row.

    fileish -- a file-like object to write to
    fields -- a list of `Field` instances
    rows -- an iterable containing one dictionary per data row
    name -- the relation name, defaults to 'unnamed relation'
    sparse -- whether the output should be in sparse format, defaults to False
    """
    emit = fileish.write

    # Header; a blank line follows the relation line and the attribute lines.
    emit(format_relation(name))
    emit('\n')
    emit(format_attributes(fields))
    emit('\n')
    emit(format_data())

    # Body
    for record in rows:
        emit(format_row(record, fields, sparse))
|
||||
Reference in New Issue
Block a user