% bioRxiv preprint; Part I of the authors' series on the ab initio perturbative formulation of indel evolution.
@article{Ezawa023598,
  author        = {Ezawa, Kiyoshi and Graur, Dan and Landan, Giddy},
  title         = {Perturbative formulation of general continuous-time {Markov} model of sequence evolution via insertions/deletions, {Part I}: Theoretical basis},
  journal       = {bioRxiv},
  publisher     = {Cold Spring Harbor Laboratory},
  year          = {2016},
  elocation-id  = {023598},
  doi           = {10.1101/023598},
  url           = {https://www.biorxiv.org/content/early/2016/01/31/023598},
  eprint        = {https://www.biorxiv.org/content/early/2016/01/31/023598.full.pdf},
  abstract      = {Background: Insertions and deletions (indels) account for more nucleotide differences between two related DNA sequences than substitutions do, and thus it is imperative to develop a stochastic evolutionary model that enables us to reliably calculate the probability of the sequence evolution through indel processes. Recently, such probabilistic models are mostly based on either hidden Markov models (HMMs) or transducer theories, both of which give the indel component of the probability of a given sequence alignment as a product of either probabilities of column-to-column transitions or block-wise contributions along the alignment. However, it is not a priori clear how these models are related with any genuine stochastic evolutionary model, which describes the stochastic evolution of an entire sequence along the time-axis. Moreover, none of these models can fully accommodate biologically realistic features, such as overlapping indels, power-law indel-length distributions, and indel rate variation across regions. Results: Here, we theoretically tackle the ab initio calculation of the probability of a given sequence alignment under a genuine evolutionary model, more specifically, a general continuous-time Markov model of the evolution of an entire sequence via insertions and deletions. Our model allows general indel rate parameters including length distributions but does not impose any unrealistic restrictions on indels. Using techniques of the perturbation theory in physics, we expand the probability into a series over different numbers of indels. Our derivation of this perturbation expansion elegantly bridges the gap between Gillespie's (1977) intuitive derivation of his own stochastic simulation method, which is now widely used in evolutionary simulators, and Feller's (1940) mathematically rigorous theorems that underpin Gillespie's method. We find a sufficient and nearly necessary set of conditions under which the probability can be expressed as the product of an overall factor and the contributions from regions separated by gapless columns of the alignment. The indel models satisfying these conditions include those with some kind of rate variation across regions, as well as space-homogeneous models. We also prove that, though with a caveat, pairwise probabilities calculated by the method of Mikl{\'o}s et al. (2004) are equivalent to those calculated by our ab initio formulation, at least under a space-homogenous model. Conclusions: Our ab initio perturbative formulation provides a firm theoretical ground that other indel models can rest on. [This paper and three other papers (Ezawa, Graur and Landan 2015a,b,c) describe a series of our efforts to develop, apply, and extend the ab initio perturbative formulation of a general continuous-time Markov model of indels.]},
}