###########################################################################
#Programs for reading single, double and triple letter probabilities from a list of words L, where each word in L is a list of letters in the alphabet A.

#Even though these use tables, and ultimately for the Goulden Jackson programs I want things in table form, I decided to have the output be structured as a list of lists (or a list of lists of lists). I find that usually I have to combine these things anyway, in order to get the tables for the application (I'll use one program for the digraph weights, one for the monograph weights, etc.). But note that all of the outputs below are indexed by numbers, NOT letters.

##################################################################
#Useful for SingleGJ (and other single letter programs)
##################################################################

#Single Letter Frequencies
#Single(L,A)[i] is the number of times the ith letter of A appears anywhere in the list L, divided by the total number of letters in L. The entries therefore sum to 1 and can be treated as probabilities.
#This program can also be used to get the single letter weights for the ProbDouble and ProbTriple programs.
#Inputs:  L - a list of words, each word being a list of letters;
#         A - the alphabet, a list of letters. Every letter occurring in L is expected to be a member of A.
#Output:  a list of nops(A) numbers, indexed by position in A.
#If L contains no letters at all, the result is a list of zeros (the divisor is clamped to 1, as in Double and Triple) rather than a division-by-zero error.
Single:=proc(L,A)
local elt,letter,F,H,S;
    #F[x] counts the occurrences of the letter x; S counts all letters seen.
    F:=table([seq( A[H]=0, H=1..nops(A))]);
    S:=0;
    for elt in L do
        for letter in elt do
            F[letter]:=F[letter]+1;
        od;
        S:=S+nops(elt);
    od;
    #Clamp the divisor so that an empty word list yields zeros instead of an error.
    S:=max(1,S);
    [seq(F[A[H]]/S, H=1..nops(A))];
end:

###########################################################################
#Useful for ProbDoubleGJ (and other double letter applications):
###########################################################################

#Double(L,A)[i][j] is the probability (in L) of the letter pair "ith letter of A, jth letter of A", given an occurrence of the ith letter of A as the first letter of a pair. This is scaled, so that the sum of the terms over any fixed first letter is 1 (so we ignore occurrences of the letter at the end of a word).
#This can be used to find the digraph probabilities for ProbDouble, but not ProbTriple (the scaling is wrong for ProbTriple).
#Inputs:  L - a list of words, each word being a list of letters;
#         A - the alphabet, a list of letters (of any length).
#Output:  a list of nops(A) rows; entry [J][K] is the count of the adjacent pair (A[J],A[K]) in L divided by the number of pairs in L whose first letter is A[J]. A row whose first letter never starts a pair is all zeros.
Double:=proc(L,A)
local elt,F,num,J,K,S,n;
    n:=nops(A);
    #F[x,y] counts how often the letter y immediately follows the letter x inside a word.
    F:=table([seq( seq( (A[J],A[K]) = 0, K=1..n), J=1..n)]);
    for elt in L do
        for num from 1 to nops(elt)-1 do
            F[elt[num],elt[num+1]]:=F[elt[num],elt[num+1]]+1;
        od;
    od;
    #Row totals, one per first letter. The range must follow the size of A (it was previously fixed at 26, which failed for any other alphabet size). max(1,...) avoids dividing by zero for letters that never start a pair.
    S:=[seq( max(1, add(F[A[J],A[K]], K=1..n)), J=1..n)];
    [seq([seq(F[A[J],A[K]]/S[J], K=1..n)], J=1..n)];
end:

##################################################################
#Useful for ProbTripleGJ (and other triple letter programs)
##################################################################

#DoubleOverall(L,A) is like Double, but scaled differently. The total sum over all of the first and second letters is 1.
#This can be used for the digraph probabilities in ProbTriple GJ, but not in ProbDoubleGJ.
#Inputs:  L - a list of words, each word being a list of letters;
#         A - the alphabet, a list of letters.
#Output:  a list of nops(A) rows; entry [J][K] is the count of the pair (A[J],A[K]) divided by the total number of pairs counted.
#Only pairs starting at positions 1..nops(elt)-2 are counted, i.e. pairs that are followed by a third letter. NOTE(review): this matches the pairs that begin a triple in Triple, which is presumably why the last pair of each word is left out - confirm.
#If no pairs are counted (no word has three or more letters), the result is all zeros rather than a division-by-zero error.
DoubleOverall:=proc(L,A)
local elt,F,num,J,K,S,n;
    n:=nops(A);
    F:=table([seq( seq( (A[J],A[K]) = 0, K=1..n), J=1..n)]);
    S:=0;
    for elt in L do
        for num from 1 to nops(elt)-2 do
            F[elt[num],elt[num+1]]:=F[elt[num],elt[num+1]]+1;
            S:=S+1;
        od;
    od;
    #Clamp the divisor so that an input with no countable pairs yields zeros.
    S:=max(1,S);
    [seq([seq(F[A[J],A[K]]/S, K=1..n)], J=1..n)];
end:

##################################################################
#Triple(L,A)[i][j][k] is the probability of finding the letter triple "ith letter of A, jth letter of A, kth letter of A", given an occurrence of "ith letter of A, jth letter of A". This is appropriate for finding the triple letter frequencies for ProbTriple.
#Inputs:  L - a list of words, each word being a list of letters;
#         A - the alphabet, a list of letters.
#Output:  a nops(A) x nops(A) x nops(A) list of lists of lists; entry [H][J][K] is the count of the consecutive triple (A[H],A[J],A[K]) in L divided by the number of triples in L that start with (A[H],A[J]). If no triple starts with that pair, the entries are zero.
Triple:=proc(L,A)
local elt,F,H,J,K,num,S,n;
    n:=nops(A);
    #F[x,y,z] counts the occurrences of the consecutive letters x,y,z inside a word.
    F:=table([seq( seq( seq( (A[H],A[J],A[K])=0, H=1..n), J=1..n), K=1..n)]);
    for elt in L do
        for num from 1 to nops(elt)-2 do
            F[elt[num],elt[num+1],elt[num+2]]:=F[elt[num],elt[num+1],elt[num+2]]+1;
        od;
    od;
    #S[H][J] is the number of triples starting with (A[H],A[J]); max(1,...) avoids dividing by zero for pairs that never start a triple.
    S:=[seq([seq( max(1, add(F[A[H],A[J],A[K]], K=1..n)), J=1..n)], H=1..n)];
    [seq([seq([seq( F[A[H],A[J],A[K]]/S[H][J], K=1..n)], J=1..n)], H=1..n)];
end:

#################################################################
# When we want to model passages of English text, not just words.
##################################################################

#PassageDouble(L,A,SP) returns the double letter frequencies in the alphabet A union {SP} (SP is a character corresponding to a blank space), suitable to be used in the English Language example of our paper.
#Inputs:  L  - a list of words, each word being a list of letters of A;
#         A  - the alphabet, a list of letters;
#         SP - the symbol standing for a blank space.
#Output:  a list of nops(A)+1 rows, each with nops(A)+1 entries. Rows and columns are indexed by the letters of A in order, followed by SP. Entry [first][second] is the count of that pair divided by the number of pairs with that first character; the (SP,SP) entry is always 0.
#Empty words are skipped, and an empty word list gives all zeros instead of a division-by-zero error.
PassageDouble:=proc(L,A,SP)
local F,S,elt,num,J,K,M,N,n;
    n:=nops(A);
    F:=table([seq( seq( (A[J],A[K]) = 0, K=1..n), J=1..n),
              seq( (SP,A[N]) = 0, N=1..n),
              seq( (A[M],SP) = 0, M=1..n)]);
    for elt in L do
        #An empty word has no first or last letter, so it contributes nothing.
        if nops(elt) = 0 then next fi;
        #Counts the letter pairs inside the word.
        for num from 1 to nops(elt)-1 do
            F[elt[num],elt[num+1]]:=F[elt[num],elt[num+1]]+1;
        od;
        #Counts the letter pairs that correspond to the beginning and end of a word as (SP,letter) and (letter,SP).
        F[SP,elt[1]]:=F[SP,elt[1]]+1;
        F[elt[nops(elt)],SP]:=F[elt[nops(elt)],SP]+1;
    od;
    #Gives us the scaling factors, so that the sum over all the terms with some fixed first character is one. The last entry is the factor for SP; every factor is clamped to at least 1 so that unused first characters give zeros.
    S:=[seq( max(1, add(F[A[J],A[K]], K=1..n) + F[A[J],SP]), J=1..n),
        max(1, add(F[SP,A[M]], M=1..n))];
    [seq( [seq( F[A[J],A[K]], K=1..n), F[A[J],SP]]/S[J], J=1..n),
     [seq( F[SP,A[M]]/S[n+1], M=1..n), 0]];
end: