123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342(**************************************************************************)(* *)(* OCaml *)(* *)(* Ulysse Gérard, Thomas Refis, Tarides *)(* Nathanaëlle Courant, OCamlPro *)(* Gabriel Scherer, projet Picube, INRIA Paris *)(* *)(* Copyright 2021 Institut National de Recherche en Informatique et *)(* en Automatique. *)(* *)(* All rights reserved. This file is distributed under the terms of *)(* the GNU Lesser General Public License version 2.1, with the *)(* special exception on linking described in the file LICENSE. *)(* *)(**************************************************************************)openShapetyperesult=|ResolvedofUid.t|Resolved_aliasofUid.t*result|Unresolvedoft|ApproximatedofUid.toption|Internal_error_missing_uidletrecprint_resultfmtresult=matchresultwith|Resolveduid->Format.fprintffmt"@[Resolved: %a@]@;"Uid.printuid|Resolved_alias(uid,r)->Format.fprintffmt"@[Alias: %a -> %a@]@;"Uid.printuidprint_resultr|Unresolvedshape->Format.fprintffmt"@[Unresolved: %a@]@;"printshape|Approximated(Someuid)->Format.fprintffmt"@[Approximated: %a@]@;"Uid.printuid|ApproximatedNone->Format.fprintffmt"@[Approximated: No uid@]@;"|Internal_error_missing_uid->Format.fprintffmt"@[Missing uid@]@;"letfind_shapeenvid=letnamespace=Shape.Sig_component_kind.ModuleinEnv.shape_of_path~namespaceenv(Pidentid)moduleMake(Params:sigvalfuel:intvalread_unit_shape:unit_name:string->toptionend)=struct(* We implement a strong call-by-need reduction, following an
evaluator from Nathanaelle Courant. *)typenf={uid:Uid.toption;desc:nf_desc;approximated:bool}andnf_desc=|NVarofvar|NAppofnf*nf|NAbsoflocal_env*var*t*delayed_nf|NStructofdelayed_nfItem.Map.t|NAliasofdelayed_nf|NProjofnf*Item.t|NLeaf|NComp_unitofstring|NErrorofstring(* A type of normal forms for strong call-by-need evaluation.
The normal form of an abstraction
Abs(x, t)
is a closure
NAbs(env, x, t, dnf)
when [env] is the local environment, and [dnf] is a delayed
normal form of [t].
A "delayed normal form" is morally equivalent to (nf Lazy.t), but
we use a different representation that is compatible with
memoization (lazy values are not hashable/comparable by default
comparison functions): we represent a delayed normal form as
just a not-yet-computed pair [local_env * t] of a term in a
local environment -- we could also see this as a term under
an explicit substitution. This delayed thunked is "forced"
by calling the normalization function as usual, but duplicate
computations are precisely avoided by memoization.
*)anddelayed_nf=Thunkoflocal_env*tandlocal_env=delayed_nfoptionIdent.Map.t(* When reducing in the body of an abstraction [Abs(x, body)], we
bind [x] to [None] in the environment. [Some v] is used for
actual substitutions, for example in [App(Abs(x, body), t)], when
[v] is a thunk that will evaluate to the normal form of [t]. *)letapprox_nfnf={nfwithapproximated=true}letin_memo_tablememo_tablememo_keyfarg=matchHashtbl.findmemo_tablememo_keywith|res->res|exceptionNot_found->letres=farginHashtbl.replacememo_tablememo_keyres;restypeenv={fuel:intref;global_env:Env.t;local_env:local_env;reduce_memo_table:(local_env*t,nf)Hashtbl.t;read_back_memo_table:(nf,t)Hashtbl.t;}letbindenvvarshape={envwithlocal_env=Ident.Map.addvarshapeenv.local_env}letrecreduce_envt=letlocal_env=env.local_envinletmemo_key=(local_env,t)inin_memo_tableenv.reduce_memo_tablememo_key(reduce__env)t(* Memoization is absolutely essential for performance on this
problem, because the normal forms we build can in some real-world
cases contain an exponential amount of redundancy. Memoization
can avoid the repeated evaluation of identical subterms,
providing a large speedup, but even more importantly it
implicitly shares the memory of the repeated results, providing
much smaller normal forms (that blow up again if printed back
as trees). A functor-heavy file from Irmin has its shape normal
form decrease from 100Mio to 2.5Mio when memoization is enabled.
Note: the local environment is part of the memoization key, while
it is defined using a type Ident.Map.t of non-canonical balanced
trees: two maps could have exactly the same items, but be
balanced differently and therefore hash differently, reducing
the effectivenss of memoization.
This could in theory happen, say, with the two programs
(fun x -> fun y -> ...)
and
(fun y -> fun x -> ...)
having "the same" local environments, with additions done in
a different order, giving non-structurally-equal trees. Should we
define our own hash functions to provide robust hashing on
environments?
We believe that the answer is "no": this problem does not occur
in practice. We can assume that identifiers are unique on valid
typedtree fragments (identifier "stamps" distinguish
binding positions); in particular the two program fragments above
in fact bind *distinct* identifiers x (with different stamps) and
different identifiers y, so the environments are distinct. If two
environments are structurally the same, they must correspond to
the evaluation evnrionments of two sub-terms that are under
exactly the same scope of binders. So the two environments were
obtained by the same term traversal, adding binders in the same
order, giving the same balanced trees: the environments have the
same hash.
*)andforceenv(Thunk(local_env,t))=reduce_{envwithlocal_env}tandreduce__({fuel;global_env;local_env;_}asenv)(t:t)=letreduceenvt=reduce_envtinletdelay_reduceenvt=Thunk(env.local_env,t)inletreturndesc={uid=t.uid;desc;approximated=t.approximated}inletrecforce_aliasesnf=matchnf.descwith|NAliasdelayed_nf->letnf=forceenvdelayed_nfinforce_aliasesnf|_->nfinletreset_uid_if_new_bindingt'=matcht.uidwith|None->t'|Some_asuid->{t'withuid}inif!fuel<0thenapprox_nf(return(NError"NoFuelLeft"))elsematcht.descwith|Comp_unitunit_name->beginmatchParams.read_unit_shape~unit_namewith|Somet->reduceenvt|None->return(NComp_unitunit_name)end|App(f,arg)->letf=reduceenvf|>force_aliasesinbeginmatchf.descwith|NAbs(clos_env,var,body,_body_nf)->letarg=delay_reduceenvarginletenv=bind{envwithlocal_env=clos_env}var(Somearg)inreduceenvbody|>reset_uid_if_new_binding|_->letarg=reduceenvarginreturn(NApp(f,arg))end|Proj(str,item)->letstr=reduceenvstr|>force_aliasesinletnored()=return(NProj(str,item))inbeginmatchstr.descwith|NStruct(items)->beginmatchItem.Map.finditemitemswith|exceptionNot_found->nored()|nf->forceenvnf|>reset_uid_if_new_bindingend|_->nored()end|Abs(var,body)->letbody_nf=delay_reduce(bindenvvarNone)bodyinreturn(NAbs(local_env,var,body,body_nf))|Varid->beginmatchIdent.Map.findidlocal_envwith(* Note: instead of binding abstraction-bound variables to
[None], we could unify it with the [Some v] case by
binding the bound variable [x] to [NVar x].
One reason to distinguish the situations is that we can
provide a different [Uid.t] location; for bound
variables, we use the [Uid.t] of the bound occurrence
(not the binding site), whereas for bound values we use
their binding-time [Uid.t]. *)|None->return(NVarid)|Somedef->beginmatchforceenvdefwith|{uid=Some_;_}asnf->nf(* This var already has a binding uid *)|{uid=None;_}asnf->{nfwithuid=t.uid}(* Set the var's binding uid *)end|exceptionNot_found->matchfind_shapeglobal_envidwith|exceptionNot_found->return(NVarid)|reswhenres=t->return(NVarid)|res->decrfuel;reduceenvresend|Leaf->returnNLeaf|Structm->letmnf=Item.Map.map(delay_reduceenv)minreturn(NStructmnf)|Aliast->return(NAlias(delay_reduceenvt))|Errors->approx_nf(return(NErrors))andread_backenv(nf:nf):t=in_memo_tableenv.read_back_memo_tablenf(read_back_env)nf(* The [nf] normal form we receive may contain a lot of internal
sharing due to the use of memoization in the evaluator. We have
to memoize here again, otherwise the sharing is lost by mapping
over the term as a tree. *)andread_back_env(nf:nf):t={uid=nf.uid;desc=read_back_descenvnf.desc;approximated=nf.approximated}andread_back_descenvdesc=letread_backnf=read_backenvnfinletread_back_forcednf=read_back(forceenvdnf)inmatchdescwith|NVarv->Varv|NApp(nft,nfu)->App(read_backnft,read_backnfu)|NAbs(_env,x,_t,nf)->Abs(x,read_back_forcenf)|NStructnstr->Struct(Item.Map.mapread_back_forcenstr)|NAliasnf->Alias(read_back_forcenf)|NProj(nf,item)->Proj(read_backnf,item)|NLeaf->Leaf|NComp_units->Comp_units|NErrors->Errors(* Sharing the memo tables is safe at the level of a compilation unit since
idents should be unique *)letreduce_memo_table=Hashtbl.create42letread_back_memo_table=Hashtbl.create42letreduceglobal_envt=letfuel=refParams.fuelinletlocal_env=Ident.Map.emptyinletenv={fuel;global_env;reduce_memo_table=reduce_memo_table;read_back_memo_table=read_back_memo_table;local_env;}inreduce_envt|>read_backenvletrecis_stuck_on_comp_unit(nf:nf)=matchnf.descwith|NVar_->(* This should not happen if we only reduce closed terms *)false|NApp(nf,_)|NProj(nf,_)->is_stuck_on_comp_unitnf|NStruct_|NAbs_->false|NAlias_->false|NComp_unit_->true|NError_->false|NLeaf->falseletrecreduce_aliases_for_uidenv(nf:nf)=matchnfwith|{uid=Someuid;desc=NAliasdnf;approximated=false;_}->letresult=reduce_aliases_for_uidenv(forceenvdnf)inResolved_alias(uid,result)|{uid=Someuid;approximated=false;_}->Resolveduid|{uid;approximated=true}->Approximateduid|{uid=None;approximated=false;_}->(* A missing Uid after a complete reduction means the Uid was first
missing in the shape which is a code error. Having the
[Missing_uid] reported will allow Merlin (or another tool working
with the index) to ask users to report the issue if it does happen.
*)Internal_error_missing_uidletreduce_for_uidglobal_envt=letfuel=refParams.fuelinletlocal_env=Ident.Map.emptyinletenv={fuel;global_env;reduce_memo_table=reduce_memo_table;read_back_memo_table=read_back_memo_table;local_env;}inletnf=reduce_envtinifis_stuck_on_comp_unitnfthenUnresolved(read_backenvnf)elsereduce_aliases_for_uidenvnfendmoduleLocal_reduce=Make(structletfuel=10letread_unit_shape~unit_name:_=Noneend)letlocal_reduce=Local_reduce.reduceletlocal_reduce_for_uid=Local_reduce.reduce_for_uid