mirror of
git://git.sv.gnu.org/coreutils.git
synced 2026-04-12 06:57:33 +02:00
rm -r: avoid O(n^2) performance for a directory with very many entries
This enhancement works around a problem that is specific to at least ext3 and ext4 file systems. With them, it would take hours to remove a two-million-entry directory. RAM-backed file systems (tmpfs) are not affected, since there is no seek penalty. * remove.c (rm_malloc, rm_free, compare_ino): New functions. (dirent_count, preprocess_dir): New functions. [struct readdir_data]: New struct. (remove_cwd_entries): Call preprocess_dir. * tests/rm/ext3-perf: New file. Test for the performance fix. * NEWS: Mention the new feature.
This commit is contained in:
7
NEWS
7
NEWS
@@ -9,6 +9,13 @@ GNU coreutils NEWS -*- outline -*-
|
||||
|
||||
** New features
|
||||
|
||||
chgrp, chmod, chown, chcon, du, rm: now all display linear performance,
|
||||
even when operating on million-entry directories on ext3 and ext4 file
|
||||
systems. Before, they would exhibit O(N^2) performance, due to linear
|
||||
per-entry seek time cost when operating on entries in readdir order.
|
||||
Rm was improved directly, while the others inherit the improvement
|
||||
from the newer version of fts in gnulib.
|
||||
|
||||
comm now verifies that the inputs are in sorted order. This check can
|
||||
be turned off with the --nocheck-order option.
|
||||
|
||||
|
||||
165
src/remove.c
165
src/remove.c
@@ -60,6 +60,14 @@ enum
|
||||
CONSECUTIVE_READDIR_UNLINK_THRESHOLD = 10
|
||||
};
|
||||
|
||||
/* If the heuristics in preprocess_dir suggest that there
   are fewer than this many entries in a directory, then it
   skips the preprocessing altogether.  Below this size, the cost of
   reading and sorting all entries is unlikely to beat the seek-time
   savings of inode-ordered unlinking.  */
enum
{
  INODE_SORT_DIR_ENTRIES_THRESHOLD = 10000
};
|
||||
|
||||
/* FIXME: in 2009, or whenever Darwin 7.9.0 (aka MacOS X 10.3.9) is no
|
||||
longer relevant, remove this work-around code. Then, there will be
|
||||
no need to perform the extra rewinddir call, ever. */
|
||||
@@ -217,6 +225,24 @@ hash_compare_strings (void const *x, void const *y)
|
||||
return STREQ (x, y) ? true : false;
|
||||
}
|
||||
|
||||
/* Obstack allocator for use with obstack_specify_allocation_with_arg:
   allocate SIZE bytes, and upon failure longjmp (rather than return
   NULL) via the jmp_buf that V_JUMPBUF points to.  */
static void *
rm_malloc (void *v_jumpbuf, long size)
{
  jmp_buf *jumpbuf = v_jumpbuf;
  void *p = malloc (size);
  if (p == NULL)
    longjmp (*jumpbuf, 1);
  return p;
}
|
||||
|
||||
/* With the 2-arg allocator, we must also provide a two-argument freer.
   obstack_specify_allocation_with_arg passes the same extra argument
   (here, the jmp_buf pointer) to both hooks; the freer ignores it.  */
static void
rm_free (void *v_jumpbuf ATTRIBUTE_UNUSED, void *ptr)
{
  free (ptr);
}
|
||||
|
||||
static inline void
|
||||
push_dir (Dirstack_state *ds, const char *dir_name)
|
||||
{
|
||||
@@ -1225,6 +1251,141 @@ fd_to_subdirp (int fd_cwd, char const *f,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Per-entry record saved during the preprocessing pass: the entry's
   inode number plus its NUL-terminated name, stored inline via a
   flexible array member so each record is a single allocation.  */
struct readdir_data
{
  ino_t ino;
  char name[FLEXIBLE_ARRAY_MEMBER];
};
|
||||
|
||||
/* A comparison function to sort on increasing inode number. */
|
||||
static int
|
||||
compare_ino (void const *av, void const *bv)
|
||||
{
|
||||
struct readdir_data const *const *a = av;
|
||||
struct readdir_data const *const *b = bv;
|
||||
return (a[0]->ino < b[0]->ino ? -1
|
||||
: b[0]->ino < a[0]->ino ? 1 : 0);
|
||||
}
|
||||
|
||||
/* Return an approximation of the maximum number of dirent entries
   in a directory with stat data *ST, assuming each entry consumes
   at least AVG_DIRENT_BYTES bytes of directory data.  */
static size_t
dirent_count (struct stat const *st)
{
  enum { AVG_DIRENT_BYTES = 16 };
  return st->st_size / AVG_DIRENT_BYTES;
}
|
||||
|
||||
/* When a directory contains very many entries, operating on N entries in
   readdir order can be very seek-intensive (be it to unlink or even to
   merely stat each entry), to the point that it results in O(N^2) work.
   This is file-system-specific: ext3 and ext4 (as of 2008) are susceptible,
   but tmpfs is not.  The general solution is to process entries in inode
   order.  That means reading all entries, and sorting them before operating
   on any.  As such, it is useful only on systems with useful dirent.d_ino.
   Since 'rm -r's removal process must traverse into directories and since
   this preprocessing phase can allocate O(N) storage, here we store and
   sort only non-directory entries, and then remove all of those, so that we
   can free all allocated storage before traversing into any subdirectory.
   Perform this optimization only when not interactive and not in verbose
   mode, to keep the implementation simple and to minimize duplication.
   Upon failure, simply free any resources and return.

   DIRP is the open directory being removed; *DIRP is always rewound
   before returning so the caller's readdir loop sees every remaining
   entry.  X supplies the rm options consulted by the guards below.
   On platforms lacking dirent.d_type this function is a no-op.  */
static void
preprocess_dir (DIR **dirp, struct rm_options const *x)
{
#if HAVE_STRUCT_DIRENT_D_TYPE

  struct stat st;
  /* There are many reasons to return right away, skipping this
     optimization.  The penalty for being wrong is that we will
     perform a small amount of extra work.

     Skip this optimization if... */

  if (/* - there is a chance of interactivity */
      x->interactive != RMI_NEVER

      /* - we're in verbose mode */
      || x->verbose

      /* - privileged users can unlink nonempty directories.
         Otherwise, there'd be a race condition between the readdir
         call (in which we learn dirent.d_type) and the unlink, by
         which time the non-directory may be replaced with a directory. */
      || ! cannot_unlink_dir ()

      /* - we can't fstat the file descriptor */
      || fstat (dirfd (*dirp), &st) != 0

      /* - the directory is smaller than some threshold.
         Estimate the number of inodes with a heuristic.
         There's no significant benefit to sorting if there are
         too few inodes. */
      || dirent_count (&st) < INODE_SORT_DIR_ENTRIES_THRESHOLD)
    return;

  /* FIXME: maybe test file system type, too; skip if it's tmpfs: see fts.c */

  struct obstack o_readdir_data; /* readdir data: inode,name pairs */
  struct obstack o_p; /* an array of pointers to each inode,name pair */

  /* Arrange to longjmp upon obstack allocation failure.  The longjmp
     can come only from rm_malloc, installed as the obstacks'
     allocator below. */
  jmp_buf readdir_jumpbuf;
  if (setjmp (readdir_jumpbuf))
    goto cleanup;

  obstack_init_minimal (&o_readdir_data);
  obstack_init_minimal (&o_p);

  obstack_specify_allocation_with_arg (&o_readdir_data, 0, 0,
                                       rm_malloc, rm_free, &readdir_jumpbuf);
  obstack_specify_allocation_with_arg (&o_p, 0, 0,
                                       rm_malloc, rm_free, &readdir_jumpbuf);

  /* Read all entries, storing <d_ino, d_name> for each non-dir one.
     Maintain a parallel list of pointers into the primary buffer. */
  while (1)
    {
      struct dirent const *dp;
      dp = readdir_ignoring_dot_and_dotdot (*dirp);
      /* no need to distinguish EOF from failure */
      if (dp == NULL)
        break;

      /* Upon a known-directory or type-unknown entry, stop this
         preprocessing pass entirely; any entries not consumed here
         are handled by the caller after the rewinddir below. */
      if (D_TYPE (dp) == DT_UNKNOWN || D_TYPE (dp) == DT_DIR)
        break;

      size_t name_len = strlen (dp->d_name);
      size_t ent_len = offsetof (struct readdir_data, name) + name_len + 1;
      struct readdir_data *v = obstack_alloc (&o_readdir_data, ent_len);
      v->ino = D_INO (dp);
      memcpy (v->name, dp->d_name, name_len + 1);

      /* Append V to the list of pointers. */
      obstack_ptr_grow (&o_p, v);
    }

  /* Compute size and finalize VV. */
  size_t n = obstack_object_size (&o_p) / sizeof (void *);
  struct readdir_data **vv = obstack_finish (&o_p);

  /* Sort on inode number. */
  qsort(vv, n, sizeof *vv, compare_ino);

  /* Iterate through those pointers, unlinking each name. */
  for (size_t i = 0; i < n; i++)
    {
      /* ignore failure: any entry we fail to unlink here is simply
         removed (or diagnosed) by the caller's normal loop instead */
      (void) unlinkat (dirfd (*dirp), vv[i]->name, 0);
    }

cleanup:
  /* Release all per-pass storage before any subdirectory traversal,
     then rewind so the caller re-reads the directory from the start. */
  obstack_free (&o_readdir_data, NULL);
  obstack_free (&o_p, NULL);
  rewinddir (*dirp);
#endif
}
|
||||
|
||||
/* Remove entries in the directory open on DIRP
|
||||
Upon finding a directory that is both non-empty and that can be chdir'd
|
||||
into, return RM_OK and set *SUBDIR and fill in SUBDIR_SB, where
|
||||
@@ -1247,6 +1408,10 @@ remove_cwd_entries (DIR **dirp,
|
||||
assert (VALID_STATUS (status));
|
||||
*subdir = NULL;
|
||||
|
||||
/* This is just an optimization.
|
||||
It's not a fatal problem if it fails. */
|
||||
preprocess_dir (dirp, x);
|
||||
|
||||
while (1)
|
||||
{
|
||||
struct dirent const *dp;
|
||||
|
||||
@@ -72,6 +72,7 @@ EXTRA_DIST += $(TESTS)
|
||||
TESTS = \
|
||||
misc/help-version \
|
||||
misc/invalid-opt \
|
||||
rm/ext3-perf \
|
||||
rm/cycle \
|
||||
chmod/no-x \
|
||||
chgrp/basic \
|
||||
|
||||
76
tests/rm/ext3-perf
Executable file
76
tests/rm/ext3-perf
Executable file
@@ -0,0 +1,76 @@
|
||||
#!/bin/sh
# ensure that "rm -rf DIR-with-many-entries" is not O(N^2)

# Copyright (C) 2008 Free Software Foundation, Inc.

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

if test "$VERBOSE" = yes; then
  set -x
  rm --version
fi

. $srcdir/test-lib.sh

# This test creates 400k files; run it only when expensive tests are enabled.
expensive_

# Using rm -rf to remove a 400k-entry directory takes:
# - 9 seconds with the patch, on a 2-yr-old system
# - 350 seconds without the patch, on a high-end system (disk 20-30% faster)
threshold_seconds=60

# The number of entries in our test directory.
n=400000

# Choose a value that is large enough to ensure an accidentally
# regressed rm would require much longer than $threshold_seconds to remove
# the directory.  With n=400k, pre-patch GNU rm would require about 350
# seconds even on a fast disk.  On a relatively modern system, the
# patched version of rm requires about 10 seconds, so even if you
# choose to enable very expensive tests with a disk that is much slower,
# the test should still succeed.

# Skip unless "." is on an ext[34] file system.
# df exits nonzero when "." is on none of the listed file system types.
# FIXME-maybe: try to find a suitable file system or allow
# the user to specify it via an envvar.
df -t ext3 -t ext4dev -t ext4 . \
  || skip_test_ 'this test runs only on an ext3 or ext4 file system'

# Skip if there are too few inodes free.  Require some slack.
# (stat -f %d prints the number of free inodes; require 1.2 * n.)
free_inodes=$(stat -f --format=%d .) || framework_failure
min_free_inodes=$(expr 12 \* $n / 10)
test $min_free_inodes -lt $free_inodes \
  || skip_test_ "too few free inodes on '.': $free_inodes;" \
    "this test requires at least $min_free_inodes"

# Create the n-entry directory, verifying first and last entries exist.
ok=0
mkdir d &&
  cd d &&
    seq $n | xargs touch &&
    test -f 1 &&
    test -f $n &&
  cd .. &&
ok=1
test $ok = 1 || framework_failure

# Time the removal; fail if it exceeds the threshold.
fail=0
start=$(date +%s)
rm -rf d || fail=1
duration=$(expr $(date +%s) - $start)
test $duration -lt $threshold_seconds ||
  { fail=1; echo rm took longer than $threshold_seconds seconds; }

echo removing a $n-entry directory took $duration seconds

Exit $fail
|
||||
Reference in New Issue
Block a user