Source code for dials.algorithms.indexing.nearest_neighbor
from __future__ import annotations
import math
[docs]
class NeighborAnalysis:
[docs]
def __init__(
self,
reflections,
step_size=45,
tolerance=1.5,
max_height_fraction=0.25,
percentile=None,
histogram_binning="linear",
nn_per_bin=5,
convert_reflections_z_to_deg=True,
):
self.tolerance = tolerance # Margin of error for max unit cell estimate
from scitbx.array_family import flex
NEAR = 10
self.NNBIN = nn_per_bin # target number of neighbors per histogram bin
self.histogram_binning = histogram_binning
direct = flex.double()
if "entering" in reflections:
entering_flags = reflections["entering"]
else:
entering_flags = flex.bool(reflections.size(), True)
rs_vectors = reflections["rlp"]
z = reflections["xyzobs.mm.value"].parts()[2]
if convert_reflections_z_to_deg:
z = z * (180 / math.pi)
d_spacings = flex.double()
# nearest neighbor analysis
from annlib_ext import AnnAdaptor
for imageset_id in range(flex.max(reflections["imageset_id"]) + 1):
sel_imageset = reflections["imageset_id"] == imageset_id
if sel_imageset.count(True) == 0:
continue
z_min = flex.min(z.select(sel_imageset))
z_max = flex.max(z.select(sel_imageset))
d_z = z_max - z_min
n_steps = max(int(math.ceil(d_z / step_size)), 1)
for n in range(n_steps):
sel_step = (
sel_imageset
& (z >= (z_min + n * step_size))
& (z < (z_min + (n + 1) * step_size))
)
for entering in (True, False):
sel_entering = sel_step & (entering_flags == entering)
if sel_entering.count(True) == 0:
continue
query = flex.double()
query.extend(rs_vectors.select(sel_entering).as_double())
if query.size() == 0:
continue
IS_adapt = AnnAdaptor(data=query, dim=3, k=1)
IS_adapt.query(query)
direct.extend(1 / flex.sqrt(IS_adapt.distances))
d_spacings.extend(1 / rs_vectors.norms())
assert (
len(direct) > NEAR
), f"Too few spots ({len(direct)}) for nearest neighbour analysis."
perm = flex.sort_permutation(direct)
direct = direct.select(perm)
d_spacings = d_spacings.select(perm)
# eliminate nonsensical direct space distances
sel = direct > 1
direct = direct.select(sel)
d_spacings = d_spacings.select(sel)
if percentile is None:
# reject top 1% of longest distances to hopefully get rid of any outliers
n = int(math.floor(0.99 * len(direct)))
direct = direct[:n]
d_spacings = d_spacings[:n]
# determine the most probable nearest neighbor distance (direct space)
if self.histogram_binning == "log":
hst = flex.histogram(
flex.log10(direct), n_slots=int(len(direct) / self.NNBIN)
)
else:
hst = flex.histogram(direct, n_slots=int(len(direct) / self.NNBIN))
if self.histogram_binning == "log":
self.slot_start = flex.double(
[10 ** (s - 0.5 * hst.slot_width()) for s in hst.slot_centers()]
)
self.slot_end = flex.double(
[10 ** (s + 0.5 * hst.slot_width()) for s in hst.slot_centers()]
)
self.slot_width = self.slot_end - self.slot_start
else:
self.slot_start = hst.slot_centers() - 0.5 * hst.slot_width()
self.slot_end = hst.slot_centers() + 0.5 * hst.slot_width()
self.slot_width = hst.slot_width()
self.relative_frequency = hst.slots().as_double() / self.slot_width
highest_bin_height = flex.max(self.relative_frequency)
if percentile is not None:
# determine the nth-percentile direct-space distance
perm = flex.sort_permutation(direct, reverse=True)
self.max_cell = (
self.tolerance * direct[perm[int((1 - percentile) * len(direct))]]
)
else:
# choose a max cell based on bins above a given fraction of the highest bin height
# given multiple
isel = (
self.relative_frequency.as_double()
> (max_height_fraction * highest_bin_height)
).iselection()
self.max_cell = (
self.tolerance * self.slot_end[int(flex.max(isel.as_double()))]
)
self.reciprocal_lattice_vectors = rs_vectors
self.d_spacings = d_spacings
self.direct = direct
self.histogram = hst
[docs]
def plot_histogram(self, filename="nn_hist.png", figsize=(12, 8)):
import matplotlib.pyplot as plt
plt.figure(figsize=figsize)
plt.bar(
self.slot_start,
self.relative_frequency,
align="center",
width=self.slot_width,
color="black",
edgecolor=None,
)
ymin, ymax = plt.ylim()
if self.histogram_binning == "log":
ax = plt.gca()
ax.set_xscale("log")
plt.vlines(
self.max_cell / self.tolerance,
ymin,
ymax,
linestyles="--",
colors="g",
label="estimated max cell",
)
plt.vlines(
self.max_cell,
ymin,
ymax,
colors="g",
label="estimated max cell (including tolerance)",
)
plt.xlabel("Direct space distance (A)")
plt.ylabel("Frequency")
plt.legend(loc="upper left")
plt.savefig(filename)
plt.clf()