Source code for permute.qa

""" Quality assurance and data cleaning.
"""


import numpy as np


def find_duplicate_rows(x, as_string=False):
    r"""
    Find rows which are duplicated in x.

    Parameters
    ----------
    x : array_like
        2D array (or nested sequence convertible to one) whose rows are
        compared with each other.
    as_string : bool
        If True, return each duplicated row as a comma-separated string
        instead of as a row of an array.

    Returns
    -------
    dups : ndarray or list of str
        One entry for every *repeat* of a row (a row occurring ``m`` times
        contributes ``m - 1`` entries), in the order produced by
        ``np.lexsort(x.T)``.  A list of strings if `as_string` is True.

    Notes
    -----
    If you load a file, for example `nsgk.csv`, as a 2D array, say `x`,
    then if you found '16,20,2,8' in the list returned by
    ``find_duplicate_rows(x, as_string=True)`` you might do something like::

        $ grep -n --context=1 '16,20,2,8' nsgk.csv
        12512-16,15,2,8
        12513:16,20,2,8
        12514-16,45,2,8
        --
        12532-17,17,2,8
        12533:16,20,2,8
        12534-17,24,2,8

    Rows containing NaN are never reported, because NaN does not compare
    equal to itself.

    http://stackoverflow.com/questions/8560440/removing-duplicate-columns-and-rows-from-a-numpy-2d-array
    """
    x = np.asarray(x)
    # Sort the rows so that identical rows end up next to each other.
    order = np.lexsort(x.T)
    x = x[order]
    # Compare neighbours with != rather than np.diff: subtraction turns
    # inf - inf into nan (hiding real duplicates) and is undefined for
    # non-numeric dtypes such as strings.
    differs = np.any(x[1:] != x[:-1], axis=1)
    dups = x[1:][~differs]
    if as_string:
        dups = [",".join(str(c) for c in r.tolist()) for r in dups]
    return dups
def find_consecutive_duplicate_rows(x, as_string=False):
    r"""
    Find rows of x which are immediately followed by an identical row.

    Unlike a general duplicate search, only *adjacent* rows are compared
    and the input is not sorted first.

    Parameters
    ----------
    x : array_like
        Array (or nested sequence convertible to one) whose entries along
        the first axis are compared with their successor.
    as_string : bool
        If True, return each matching row as a comma-separated string
        instead of as a row of an array.

    Returns
    -------
    dups : ndarray or list of str
        One entry for every position ``i`` where ``x[i + 1]`` equals
        ``x[i]`` element-wise, in input order (a run of ``m`` identical
        rows contributes ``m - 1`` entries).  Empty when `x` has fewer
        than two rows.  A list of strings if `as_string` is True.
    """
    x = np.asarray(x)
    # Element-wise comparison of every row with the one before it; this
    # is well-defined (and empty) for inputs with zero or one row.
    same = x[1:] == x[:-1]
    # Reduce over every axis except the first so that each row yields a
    # single flag; for 1D input there is nothing to reduce.
    repeats = same.all(axis=tuple(range(1, same.ndim)))
    dups = x[:-1][repeats]
    if as_string:
        dups = [",".join(str(c) for c in r.tolist()) for r in dups]
    return dups