(**************************************************************************)
(*                                                                        *)
(*  Copyright (C) Jean-Christophe Filliatre                               *)
(*                                                                        *)
(*  This software is free software; you can redistribute it and/or        *)
(*  modify it under the terms of the GNU Library General Public           *)
(*  described in file LICENSE.                                            *)
(*                                                                        *)
(*  This software is distributed in the hope that it will be useful,      *)
(*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *)
(*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                  *)
(*                                                                        *)
(**************************************************************************)

(*s {\bf Anagrams.} The following program finds all the anagrams of a given
set of characters among a dictionary. Such a dictionary can be built by
the following program given a list of files containing words (one per
line). *)

(*s The dictionary is implemented as a \emph{trie}. It is a multi-branching
tree, where branches are labelled with characters. Each node contains
a boolean which says if the word corresponding to the path from the root
belongs to the dictionary. The branches are implemented as maps from
characters to dictionaries. *)

module Cmap = Map.Make(struct type t = char let compare = compare end)

type tree = Node of bool * tree Cmap.t

let empty = Node (false, Cmap.empty)

(*s Insertion of a new word in the trie is just a matter of descending in
the tree, taking the branches corresponding to the successive characters.
Each time a branch does not exist, we continue the insertion in a new
empty tree. When the insertion is done in the subtree, we update the
branching to the new subtree. *)

let n = String.length w in
let rec addrec i (Node (b,m) as t) =
if i = n then
if b then t else Node (true,m)
else
let c = w.[i] in
let br = try Cmap.find c m with Not_found -> empty in
let t' = addrec (succ i) br in
Node (b, Cmap.add c t' m)
in

(*s Even if it is not necessary, here is the function [mem] to tests
whether a word belongs or not to the dictionary. *)

let mem t w =
let n = String.length w in
let rec look i (Node (b,m)) =
if i = n then
b
else
try look (succ i) (Cmap.find w.[i] m) with Not_found -> false
in
look 0 t

(*s The algorithm for anagrans is the following. We start from the root
of the tree with all the initial characters. Then, for each
\emph{distinct} character [c], we descend in the corresponding branch,
and apply recursively the algorithm with \emph{one occurrence} of [c] being
removed. When the collection of characters is empty, we simply test
the boolean at the current node. Whenever a branch is missing, we stop
the exploration.

It appears that we need to deal with \emph{multi-sets} of characters.
Indeed, we have to keep the collection of characters which have not yet
been examined, and it may contain repetitions. *)

(*s Multi-sets of characters are implemented as maps from characters to
positive integers. The operations of insertion and deletion are
easily implemented. We also provide a function [ms_of_string] to
build the multi-set corresponding to a given string. *)

try let n = Cmap.find c m in Cmap.add c (succ n) m
with Not_found -> Cmap.add c 1 m

let ms_remove c m =
let n = Cmap.find c m in
if n = 1 then Cmap.remove c m else Cmap.add c (pred n) m

let ms_of_string w =
let n = String.length w in
let rec add i = if i = n then Cmap.empty else ms_add w.[i] (add (succ i)) in

(*s Then implementing the above algorithm is rather easy. During the
exploration, we keep three values: first, the current path [pref]
from the root of the initial tree, in reverse order; secondly, the
current node being examined, [(b,m)]; and finally, the current
multi-set of characters [s]. *)

let subset = ref true

let rec print_prefix = function
| [] -> ()
| c::l -> print_prefix l; print_char c

let anagram d w =
let rec traverse pref (Node (b,m)) s =
if b && (s = Cmap.empty || !subset) then begin
print_prefix pref; print_newline ()
end;
Cmap.iter
(fun c _ ->
try traverse (c::pref) (Cmap.find c m) (ms_remove c s)
with Not_found -> ()) s
in
traverse [] d (ms_of_string w)

words contained in file [file] and inserts them in the tree [t].
Then function [build_dict] constructs the whole dictionary by
successively inserting the words for the given list of files. *)

Printf.printf "Reading %s\n" file; flush stdout;
let ch = open_in file in
try let w = input_line ch in read (add t w) with End_of_file -> t
in
let t' = read t in close_in ch; t'

let build_dict = List.fold_left add_one_file empty

(*s The following function [print_all] prints all the words of a given
dictionary. Only used for checks (option \texttt{-a}). *)

let print_all d =
let rec traverse pref (Node (b, m)) =
if b then begin print_prefix pref; print_newline () end;
Cmap.iter (fun c t -> traverse (c::pref) t) m
in
traverse [] d

(*s The main program. It mainly provides two ways of invoking the program:
first, the option \texttt{-b} will build the dictionary from the given
files and put it in the file ["dict.out"];
secondly, the program invoked with a word on the command line
will print all the anagrams for this word.
Option \texttt{-e} specifies exact anagrams (i.e., with all characters
used). *)

let output_dict d =
let ch = open_out "dict.out" in output_value ch d; close_out ch

let input_dict () =
let ch = open_in "dict.out" in let d = input_value ch in close_in ch; d

let usage () =
prerr_endline "usage:";
prerr_endline "  anagram -b files";
prerr_endline "  anagram [-e] word"

let main () =
match List.tl (Array.to_list Sys.argv) with
| [] | "-h" :: _ -> usage ()
| "-a" :: _ -> let d = input_dict () in print_all d
| "-b" :: files -> let d = build_dict files in output_dict d
| "-e" :: w :: _ -> subset := false; let d = input_dict () in anagram d w
| w :: _ -> let d = input_dict () in anagram d w

let _ = Printexc.catch main ()