#!/usr/bin/awk -f
#
# Renames the imputed values that are not in the original dictionary,
# giving each of them a fresh, increasing item id.
#
# Usage: awk -f <this script> ORIGINAL_DICT IMPUTED_DICT DATA
#   input1: the dictionary of the original data
#   input2: the dictionary of the imputed data
#   input3: the imputed data (comma-separated fields)
#   output: the imputed data, one line per input line, each field followed
#           by a single space; every value that is missing from the original
#           dictionary is replaced by its newly assigned item id
#
# Dictionary layout, as read by the rules below: a line containing '#'
# starts a new column; every other line is one value of the current column.

BEGIN {
    original = ARGV[1]
    imputed  = ARGV[2]
    data     = ARGV[3]
    col      = -1    # 0-based index of the dictionary column being read
    maxItem  = -1    # highest item id seen so far; new ids continue from it
    FS       = ","
}

# Original dictionary, column header: move on to the next column.
FILENAME == original && /#/ {
    col++
    next
}

# Original dictionary, value line: remember the value for its column and
# track the highest id in use. The whole line is the key, so values that
# contain blanks are kept intact.
FILENAME == original {
    known[col, $0] = 1
    # Take the true maximum rather than the last line read, so new ids
    # cannot collide with existing ones when the dictionary is unsorted.
    if ($0 + 0 > maxItem)
        maxItem = $0 + 0
    next
}

# Imputed dictionary, first line: restart the column counter. There is no
# "next" here on purpose: the line is still handled by the rules below.
FILENAME == imputed && FNR == 1 {
    col = -1
}

# Imputed dictionary, column header: move on to the next column.
FILENAME == imputed && /#/ {
    col++
    next
}

# Imputed dictionary, value line: a value unknown to the same column of the
# original dictionary gets the next free item id. The key uses col + 1
# because data fields are numbered from 1 while columns are counted from 0.
FILENAME == imputed {
    if (!((col, $0) in known)) {
        maxItem++
        newValues[(col + 1) ":" $0] = maxItem
    }
    next
}

# Imputed data: print every field, substituting the new id where one was
# assigned for that (field position, value) pair.
FILENAME == data {
    for (i = 1; i <= NF; i++) {
        key = i ":" $i
        if (key in newValues)
            printf("%d ", newValues[key])
        else
            printf("%s ", $i)
    }
    printf("\n")
}