Using awk:
$ awk 'max<NF{max=NF} # Track the largest field count seen on any line
{ # For every input line,
  for(i=1;i<=NF;i++){
    b[$i]++; # Record every distinct token, like foo, bar etc.
    a[i,$i]++; # Record the (column, token) pair; SUBSEP keeps the key unambiguous
  }
}
END{
  for(i in b) # Get max length of all the tokens (for printing)
    if(c<length(i))
      c=length(i);
  for(i in b) # For each token (for-in order is unspecified in awk),
  {
    for(j=1;j<=max;j++){ # For every column,
      if((j,i) in a) d = i; # Print the token where it occurred in this column...
      else d=""; # ...otherwise leave the cell blank
      printf "%-"(c+5)"s", d; # Left-justify, padded to the longest token plus 5
    }
    print ""; # Print newline after every token row.
  }
}' test.input
foo foo
baz baz
qux qux
bar bar
Regarding the order of the input vs. the output data: I don't think the input tokens have any inherent order, because the input data below should give similar output.
foo foo
bar
baz baz bar
qux qux
It is possible to keep the tokens in the order in which they first appeared; e.g., in the reordered case above, that would be foo, bar, baz, qux.
$ awk 'max<NF{max=NF} # Track the largest field count seen on any line
{ # For every input line,
  for(i=1;i<=NF;i++){
    if(!b[$i]++) # First time this token is seen:
      token[n++]=$i; # remember it, numbered from 0 in order of first appearance
    a[i,$i]++; # Record the (column, token) pair; SUBSEP keeps the key unambiguous
  }
}
END{
  for(k=0;k<n;k++) # Get max length of all the tokens (for printing)
    if(max_len<length(token[k]))
      max_len=length(token[k]);
  for(k=0;k<n;k++) { # For each token; an indexed loop keeps first-seen order in any awk
    for(j=1;j<=max;j++){ # For every column,
      if((j,token[k]) in a) d = token[k]; # Print the token where it occurred in this column...
      else d=""; # ...otherwise leave the cell blank
      printf "%-"(max_len+5)"s", d; # Left-justify, padded to the longest token plus 5
    }
    print ""; # Print newline after every token row.
  }
}' test.input.reordered
foo foo
bar bar
baz baz
qux qux
与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…