#!/usr/bin/env python
"""
Tagging golden data in terminal.
The script will randomly sample candidate pairs from blocked data, and show the
candidate pair for tagging in color coded form waiting for user input (y/n).
Default being no.
Usage: golden.py <blocked-cand-pairs[.csv]> <start-cand-id> <end-cand-id>
"""
import sys
import csv
import numpy as np
import pickle
import logging
from blessings import Terminal
def csvquote(value):
    """Return ``value`` as unicode text quoted for use as one CSV field.

    The value is first converted to unicode text: byte strings are decoded
    as UTF-8, unicode strings are used as they are, and anything else (for
    example an integer) goes through the text constructor. Carriage returns
    are then replaced by spaces. The result is wrapped in double quotes when
    it contains a comma, a newline or a double quote, and every embedded
    double quote is doubled.

    Args:
        value: The cell value: text, UTF-8 encoded bytes, or any object with
            a text representation.

    Returns:
        The unicode text to place between the commas of a CSV line.

    Raises:
        UnicodeDecodeError: If ``value`` is a byte string that is not valid
            UTF-8.

    >>> csvquote('abc') == u'abc'
    True
    >>> csvquote('a, b, c') == u'"a, b, c"'
    True
    >>> csvquote('a "b" c') == u'"a ""b"" c"'
    True
    >>> csvquote('a\\nb') == u'"a\\nb"' # Escape for doctest
    True
    >>> csvquote('a, \\n"b", c') == u'"a, \\n""b"", c"' # Escape for doctest
    True
    >>> csvquote(1) == u'1'
    True
    >>> csvquote(u'caf\\xe9, bar') == u'"caf\\xe9, bar"'
    True
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3: str already is the unicode text type

    if isinstance(value, bytes):
        # On Python 2 ``bytes`` is ``str``, so plain byte strings land here.
        value = value.decode('utf-8')
    elif not isinstance(value, text_type):
        value = text_type(value)

    value = value.replace(u'\r', u' ')
    if u'"' in value or u',' in value or u'\n' in value:
        return u'"' + value.replace(u'"', u'""') + u'"'
    return value
class CsvDataFile(object):
    """CSV file loaded fully into memory as a header index plus data rows.

    Attributes:
        header: Dict mapping each column name of the header row to its
            0-based column index.
        rows: List of the rows after the header, each a list of cell
            strings, in file order.
    """

    def __init__(self, path):
        """Read the CSV file at ``path``.

        The first non-empty row is taken as the header and every row after
        it is kept as data. A file without any non-empty row leaves both
        ``header`` and ``rows`` empty instead of failing.

        Args:
            path: Path to the CSV file.
        """
        self.header = {}
        self.rows = []
        if sys.version_info[0] >= 3:
            # Python 3's csv module needs a text stream opened with
            # newline=''; the file is read as UTF-8.
            csvfile = open(path, newline='', encoding='utf-8')
        else:
            # Python 2's csv module needs a binary stream.
            csvfile = open(path, 'rb')
        with csvfile:
            reader = csv.reader(csvfile)
            # Blank lines parse as empty lists; skip them until the header.
            header = next((row for row in reader if row), [])
            # The reader is now positioned just after the header row.
            self.rows = list(reader)
        for index, col in enumerate(header):
            self.header[col] = index
def write_header(header, golden='golden.csv'):
    """Append the column header line to the golden data file.

    The column names are laid out by their index, followed by an extra
    ``label:INTEGER`` column. Each name goes through ``csvquote`` so that the
    header is quoted and encoded the same way as the data rows. Nothing is
    written when ``golden`` already has content, so resuming a tagging
    session does not put a second header line in the middle of the data.

    Args:
        header: Dict mapping column names to their 0-based column index.
        golden: Path of the golden data file to append to.
    """
    import os  # local import: the file does not import os at module level

    if os.path.exists(golden) and os.path.getsize(golden) > 0:
        return

    # Size by the largest index rather than len(header): duplicate column
    # names collapse into one dict entry, leaving fewer keys than columns.
    width = max(header.values()) + 1 if header else 0
    names = [''] * width
    for name, index in header.items():
        names[index] = name

    line = u','.join(map(csvquote, names + ['label:INTEGER'])) + u'\n'
    with open(golden, 'ab') as log:
        log.write(line.encode('utf-8'))
def write_golden_pair(row, answer, golden='golden.csv'):
    """Append one labelled candidate pair to the golden data file.

    The cells of ``row`` followed by ``answer`` are each passed through
    ``csvquote``, joined with commas and appended as one UTF-8 encoded line.

    Args:
        row: List of cell values of the candidate pair.
        answer: Label written as the last column of the line.
        golden: Path of the golden data file to append to.
    """
    with open(golden, 'ab') as out:
        cells = [csvquote(cell) for cell in row + [answer]]
        record = u','.join(cells) + u'\n'
        out.write(record.encode('utf-8'))
def show_cand_pair(header, row, current=None):
    """Show one candidate pair in the terminal, ask for a label, record it.

    The screen is cleared, the Google Play values are printed in blue and the
    iTunes values in red, and the user is asked ``Match? y/[n]``. An answer
    of ``y`` or ``yes`` (any letter case, surrounding whitespace ignored) is
    stored as label 1; anything else, including just pressing Enter, is
    stored as 0. The row and its label are passed to ``write_golden_pair``.

    Args:
        header: Dict mapping column names to indices into ``row``. The
            ``name:TEXT``, ``year:INTEGER``, ``actors:TEXT``,
            ``directors:TEXT`` and ``id:TEXT`` columns must be present for
            both the ``google-play.`` and ``itunes.`` prefixes.
        row: List of cell values of the candidate pair.
        current: Optional counter printed in brackets above the pair.

    Raises:
        KeyError: If one of the required columns is missing from ``header``.
    """
    try:
        ask = raw_input  # Python 2
    except NameError:
        ask = input  # Python 3

    t = Terminal()
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('{t.clear}{t.black}\n'.format(t=t))
    print('{t.blue}{t.bold}Google Play'.format(t=t))
    print('{t.red}{t.bold}iTunes'.format(t=t))
    print('')
    # Compare against None so that a counter value of 0 is still displayed.
    if current is not None:
        print('{t.bold}{t.black} [{current}]'.format(t=t, current=current))
    print('{t.bold}{t.black} NAME {t.normal}{t.blue}{name}'.format(
        t=t, name=row[header['google-play.name:TEXT']]))
    print(' {t.normal}{t.red}{name}'.format(
        t=t, name=row[header['itunes.name:TEXT']]))
    print('{t.bold}{t.black}YEAR {t.normal}{t.blue}{year0} {t.red}{year1}'.format(
        t=t, year0=row[header['google-play.year:INTEGER']],
        year1=row[header['itunes.year:INTEGER']]))
    print('{t.bold}{t.black}ACTORS {t.normal}{t.blue}{name}'.format(
        t=t, name=row[header['google-play.actors:TEXT']]))
    print(' {t.normal}{t.red}{name}'.format(
        t=t, name=row[header['itunes.actors:TEXT']]))
    print('{t.bold}{t.black}DIRECTORS {t.normal}{t.blue}{name}'.format(
        t=t, name=row[header['google-play.directors:TEXT']]))
    print(' {t.normal}{t.red}{name}'.format(
        t=t, name=row[header['itunes.directors:TEXT']]))
    print('{t.bold}{t.black}URL {t.normal}{t.blue}{name}'.format(
        t=t, name='https://play.google.com/store/movies/details?id=' +
        row[header['google-play.id:TEXT']]))
    print(' {t.normal}{t.red}{name}'.format(
        t=t, name='https://itunes.apple.com/us/movie/x/id' +
        row[header['itunes.id:TEXT']]))
    print('{t.normal}'.format(t=t))

    reply = ask('Match? y/[n]: ')
    # Normalise the reply so that "Y" or "y " is not silently stored as a
    # non-match.
    answer = 1 if reply.strip().lower() in ('y', 'yes') else 0
    write_golden_pair(row, answer)
def tag_golden(data, start, end, seen=None):
    """Show the candidate pairs of ``[start, end)`` in random order for tagging.

    The row ids in the range, minus those in ``seen``, are shuffled with
    ``np.random.shuffle``. The header is written with ``write_header`` and
    each selected row is then passed to ``show_cand_pair`` together with a
    running counter that starts at 1.

    Args:
        data: Loaded data exposing ``header`` (column name to index dict)
            and ``rows`` (list of rows).
        start: First row id of the range (inclusive).
        end: Last row id of the range (exclusive).
        seen: Optional iterable of row ids to leave out.

    Raises:
        IndexError: If the range reaches outside ``data.rows``. This is
            checked before anything is written, rather than surfacing at a
            random point in the middle of the session.
    """
    total = len(data.rows)
    # Only reject ranges that would fail on data.rows[rowid] anyway.
    if start < end and (start < -total or end > total):
        raise IndexError(
            'candidate range [{0}, {1}) is outside the {2} loaded rows'.format(
                start, end, total))

    logging.info('Sampling randomly ...')
    candidates = set(range(start, end))
    if seen:
        # Accept any iterable of ids, not only a set.
        candidates -= set(seen)
    order = np.array(list(candidates))
    np.random.shuffle(order)

    logging.info('Starting to show candidate pairs randomly ...')
    write_header(data.header)
    for current, rowid in enumerate(order, 1):
        show_cand_pair(data.header, data.rows[rowid], current)
def main():
    """Run a tagging session from the command line.

    Reads the CSV path from ``sys.argv[1]`` and the start and end candidate
    ids from ``sys.argv[2]`` and ``sys.argv[3]``, loads the file, checks that
    the first column of the last row equals the number of rows minus one, and
    hands over to ``tag_golden``.

    Raises:
        SystemExit: With a usage message when arguments are missing.
        ValueError: If an id argument is not an integer, the file has no
            data rows, or the row count does not match the last row's id.
    """
    if len(sys.argv) < 4:
        sys.exit('Usage: golden.py <blocked-cand-pairs[.csv]> '
                 '<start-cand-id> <end-cand-id>')
    # Parse the ids first so a typo fails before the file is loaded.
    start = int(sys.argv[2])
    end = int(sys.argv[3])

    logging.basicConfig(level=logging.DEBUG)
    logging.info('Loading data ...')
    data = CsvDataFile(sys.argv[1])

    # Explicit checks instead of ``assert``, which is stripped under -O.
    if not data.rows:
        raise ValueError('{0} contains no data rows'.format(sys.argv[1]))
    last_id = int(data.rows[-1][0])
    if len(data.rows) != last_id + 1:
        raise ValueError(
            '{0} has {1} rows but its last row has id {2}'.format(
                sys.argv[1], len(data.rows), last_id))

    tag_golden(data, start, end)
# Start a tagging session only when the file is executed as a script.
if __name__ == '__main__':
    main()