Skip to content

Unix Filesystem (Advanced)

Advanced filesystem topics: atomicity guarantees, advisory and byte-range locking, change-detection APIs, performance tuning, and reusable patterns for safe concurrent file access.

| Operation | Atomic? | Notes |
|---|---|---|
| rename() same fs | Yes | Atomically replaces destination |
| rename() cross-fs | No | Fails with EXDEV |
| link() hard link | Yes | Creates link atomically |
| symlink() | Yes | Creates symlink atomically |
| open(O_CREAT \| O_EXCL) | Yes | Atomic create-if-not-exists |
| write() small | ~Yes | Atomic up to PIPE_BUF (512-4096 bytes) on pipes/FIFOs only; regular files have no such guarantee |
| write() large | No | May interleave with concurrent writes |
| mkdir() | Yes | Creates directory atomically |
| unlink() | Yes | Removes link atomically |
| O_APPEND writes | Yes | Seek-and-write is atomic |
| Metadata + data write | No | Use fsync() for durability, not atomicity |

Never write directly to the target file. Write to temp, then rename:

Terminal window
# Shell: atomic config update
tmp=$(mktemp /etc/config.XXXXXX)
echo "new content" > "$tmp"
chmod 644 "$tmp"
mv "$tmp" /etc/config # Atomic replacement
# Python: atomic file write
import os
import tempfile
def atomic_write(path, content):
    """Atomically replace the file at *path* with *content*.

    The data is written to a temporary file created in the same directory
    as *path* (so the final rename never crosses a filesystem boundary),
    flushed to disk with ``fsync``, and then renamed over the destination.
    Readers therefore see either the old file or the complete new one.

    Args:
        path: Destination file path.
        content: Text to write (encoded with the default UTF-8 codec), or
            raw bytes, which are written unchanged.

    Raises:
        OSError: If the temporary file cannot be created, written, synced
            or renamed. The temporary file is removed before re-raising.
    """
    data = content.encode() if isinstance(content, str) else content
    # A bare filename has an empty dirname; that means the current directory.
    dir_name = os.path.dirname(path) or "."
    fd, tmp_path = tempfile.mkstemp(dir=dir_name)
    try:
        # The file object owns fd from here on, so it is closed exactly once
        # regardless of which step fails.
        with os.fdopen(fd, "wb") as tmp_file:
            tmp_file.write(data)  # buffered write: no short-write handling needed
            tmp_file.flush()
            os.fsync(tmp_file.fileno())  # Flush to disk before publishing
        os.replace(tmp_path, path)  # Atomic
    except BaseException:
        # Clean up on any failure (including KeyboardInterrupt), then let the
        # original exception propagate unchanged.
        try:
            os.unlink(tmp_path)
        except FileNotFoundError:
            pass
        raise
// Go: atomic file write
func atomicWrite(path string, data []byte, perm os.FileMode) error {
dir := filepath.Dir(path)
f, err := os.CreateTemp(dir, ".tmp")
if err != nil {
return err
}
tmpPath := f.Name()
defer os.Remove(tmpPath) // Clean up on failure
if _, err := f.Write(data); err != nil {
f.Close()
return err
}
if err := f.Sync(); err != nil { // fsync
f.Close()
return err
}
if err := f.Close(); err != nil {
return err
}
if err := os.Chmod(tmpPath, perm); err != nil {
return err
}
return os.Rename(tmpPath, path) // Atomic
}

Extended rename with atomicity flags:

FlagBehavior
RENAME_NOREPLACEFail if destination exists (atomic check)
RENAME_EXCHANGEAtomically swap two files
// Atomic swap of two files (Linux only)
#include <linux/fs.h>
renameat2(AT_FDCWD, "file_a", AT_FDCWD, "file_b", RENAME_EXCHANGE);
TypeEnforced byUse case
AdvisoryConventionCooperating processes
MandatoryKernelRare, disabled on most systems

Advisory locks only work if all processes agree to check them.

#include <sys/file.h>
int fd = open("file", O_RDWR);
flock(fd, LOCK_EX); // Exclusive lock (blocks)
flock(fd, LOCK_SH); // Shared lock (blocks)
flock(fd, LOCK_EX | LOCK_NB); // Non-blocking
flock(fd, LOCK_UN); // Unlock
FlagMeaning
LOCK_SHShared (read) lock
LOCK_EXExclusive (write) lock
LOCK_NBNon-blocking (return EWOULDBLOCK)
LOCK_UNRelease lock

Key behaviors:

  • Locks are on open file descriptions, not file descriptors
  • Inherited across fork() (both processes share the lock)
  • Released when all FDs to that description close
  • Not released on exec() unless O_CLOEXEC
#include <fcntl.h>
struct flock fl = {
.l_type = F_WRLCK, // F_RDLCK, F_WRLCK, F_UNLCK
.l_whence = SEEK_SET,
.l_start = 0,
.l_len = 0 // 0 = entire file
};
fcntl(fd, F_SETLK, &fl); // Non-blocking
fcntl(fd, F_SETLKW, &fl); // Blocking
fcntl(fd, F_GETLK, &fl); // Test lock
Lock typeMeaning
F_RDLCKShared (read) lock
F_WRLCKExclusive (write)
F_UNLCKRelease lock

Key behaviors:

  • Locks are held per process, not per FD (closing any FD that refers to the file releases all of the process's locks on it)
  • Works over NFS (unlike flock on older kernels)
  • Can lock byte ranges within a file

For process exclusion (daemons, scripts):

Terminal window
# Shell: lock file with flock
exec 200>/var/run/myapp.lock
flock -n 200 || { echo "Already running"; exit 1; }
# ... do work ...
# Python: lock file
import fcntl
import os
import sys
def acquire_lock(path):
    """Try to take an exclusive, non-blocking ``flock`` on *path*.

    The lock file is created if it does not exist.

    Args:
        path: Path of the lock file.

    Returns:
        The open file descriptor holding the lock, or ``None`` if the lock
        is already held elsewhere. The caller must keep the descriptor open
        for as long as the lock is needed.

    Raises:
        OSError: If the file cannot be opened or locking fails for a reason
            other than contention; the descriptor is closed first.
    """
    # Pass an explicit mode: os.open() defaults to 0o777, which would create
    # the lock file with execute bits set.
    fd = os.open(path, os.O_CREAT | os.O_RDWR, 0o644)
    try:
        fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except BlockingIOError:
        # Lock is held elsewhere.
        os.close(fd)
        return None
    except BaseException:
        # Do not leak the descriptor on unexpected failures.
        os.close(fd)
        raise
    return fd
lock_fd = acquire_lock('/var/run/myapp.lock')
if lock_fd is None:
sys.exit("Already running")
# Lock released when process exits
Terminal window
# Write PID atomically
echo $$ > /var/run/myapp.pid.tmp
mv /var/run/myapp.pid.tmp /var/run/myapp.pid
# Check if process is running
if [ -f /var/run/myapp.pid ]; then
pid=$(cat /var/run/myapp.pid)
if kill -0 "$pid" 2>/dev/null; then
echo "Running as $pid"
else
echo "Stale PID file"
fi
fi
Featureflockfcntl
GranularityWhole fileByte ranges
NFS supportLinux 2.6.12+Yes
Release on closeAll FDs closedAny FD closed
Inherited on forkYes (shared)No (per-process)
POSIX standardNo (BSD)Yes
PlatformAPINotes
LinuxinotifyFile/directory level, recursive needs work
macOSFSEventsDirectory level, coalesced events
BSDkqueueFile descriptor based, very flexible
WindowsReadDirectoryChangesWDirectory based
#include <sys/inotify.h>
int fd = inotify_init1(IN_NONBLOCK);
int wd = inotify_add_watch(fd, "/path",
IN_CREATE | IN_DELETE | IN_MODIFY | IN_MOVE);
// Read events
char buf[4096];
ssize_t len = read(fd, buf, sizeof(buf));
struct inotify_event *event = (struct inotify_event *)buf;
EventMeaning
IN_ACCESSFile accessed (read)
IN_MODIFYFile modified
IN_ATTRIBMetadata changed
IN_CLOSE_WRITEWritable file closed
IN_CLOSE_NOWRITENon-writable file closed
IN_OPENFile opened
IN_MOVED_FROMFile moved out of watched dir
IN_MOVED_TOFile moved into watched dir
IN_CREATEFile/dir created
IN_DELETEFile/dir deleted
IN_DELETE_SELFWatched item itself deleted
IN_MOVE_SELFWatched item itself moved

Limitations:

  • Not recursive (must add watches to subdirectories manually)
  • Events can be coalesced or lost under load (IN_Q_OVERFLOW)
  • Race condition: files may change before you can read them
  • Doesn’t work on network filesystems (NFS, CIFS)
#include <sys/event.h>
int kq = kqueue();
int fd = open("/path/to/file", O_RDONLY);
struct kevent change;
EV_SET(&change, fd, EVFILT_VNODE,
EV_ADD | EV_CLEAR,
NOTE_WRITE | NOTE_DELETE | NOTE_RENAME,
0, NULL);
kevent(kq, &change, 1, NULL, 0, NULL);
// Wait for events
struct kevent event;
int n = kevent(kq, NULL, 0, &event, 1, NULL);
Filter flagMeaning
NOTE_DELETEFile deleted
NOTE_WRITEFile modified
NOTE_EXTENDFile extended
NOTE_ATTRIBAttributes changed
NOTE_LINKLink count changed
NOTE_RENAMEFile renamed
NOTE_REVOKEAccess revoked

Advantages over inotify:

  • Unified API for files, sockets, processes, signals, timers
  • Works on file descriptor (survives rename)
  • Per-event EV_CLEAR for edge-triggered behavior
#include <CoreServices/CoreServices.h>
/*
 * FSEvents stream callback: prints one line per reported path.
 *
 * numEvents  - number of entries in eventPaths
 * eventPaths - interpreted here as an array of NUL-terminated path strings
 * stream, info, flags and ids are not used.
 */
void callback(ConstFSEventStreamRef stream,
              void *info,
              size_t numEvents,
              void *eventPaths,
              const FSEventStreamEventFlags flags[],
              const FSEventStreamEventId ids[]) {
    char **paths = eventPaths;
    for (size_t i = 0; i < numEvents; i++) {
        /* The newline must be an escape; a literal line break inside the
           string does not compile. */
        printf("Changed: %s\n", paths[i]);
    }
}
CFStringRef path = CFSTR("/path/to/watch");
CFArrayRef paths = CFArrayCreate(NULL, (const void **)&path, 1, NULL);
FSEventStreamRef stream = FSEventStreamCreate(
NULL, &callback, NULL, paths,
kFSEventStreamEventIdSinceNow,
1.0, // Latency in seconds
kFSEventStreamCreateFlagFileEvents
);
FSEventStreamScheduleWithRunLoop(stream, CFRunLoopGetCurrent(),
kCFRunLoopDefaultMode);
FSEventStreamStart(stream);
CFRunLoopRun();
FlagMeaning
kFSEventStreamCreateFlagFileEventsFile-level events (not just dirs)
kFSEventStreamCreateFlagNoDeferDon’t batch events
LibraryLanguagePlatforms
watchdogPythonLinux, macOS, Windows
notifyRustLinux, macOS, Windows
fsnotifyGoLinux, macOS, Windows
chokidarNode.jsLinux, macOS, Windows
# Python watchdog example
import time

from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer


class Handler(FileSystemEventHandler):
    """Print the path of every modification event delivered by the observer."""

    def on_modified(self, event):
        print(f"Modified: {event.src_path}")


observer = Observer()
observer.schedule(Handler(), "/path/to/watch", recursive=True)
observer.start()
try:
    # start() returns immediately; keep the main thread alive so the
    # observer thread can go on delivering events.
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    pass
finally:
    # Shut the observer down and wait for its thread to finish.
    observer.stop()
    observer.join()
AspectMany small filesFew large files
Metadata opsHigh overheadLow overhead
Directory scansSlowFast
Inode usageCan exhaust inodesMinimal
FragmentationHigherLower (usually)
Backup/syncSlowerFaster

Rule of thumb: If files are < 4KB, consider combining them.

Terminal window
# Bad: spawns find subprocess for each dir
for d in */; do find "$d" -name "*.log"; done
# Good: single find invocation
find . -name "*.log"
# Best: parallel with fd (if available)
fd -e log

Optimization strategies:

  1. Avoid stat() calls when possible - ls -1 vs ls -l
  2. Use openat/fstatat - reduces path resolution overhead
  3. Read directories in batches - getdents64 syscall
  4. Sort by inode for sequential disk access (HDD only)
Cache levelSizeLatency
Page cacheRAM-based~100ns
Disk cache~64-256MB~1ms (SSD)
Cold disk-~10ms (HDD)
Terminal window
# Drop caches (Linux, requires root)
echo 3 > /proc/sys/vm/drop_caches
# Check cache hit ratio
cat /proc/meminfo | grep -E "Cached|Buffers"
# Force bypass page cache
dd if=/dev/sda of=/dev/null bs=1M iflag=direct
int fd = open("file", O_RDWR | O_DIRECT);
// Buffer must be aligned (typically 512 or 4096 bytes)
void *buf;
posix_memalign(&buf, 4096, 4096);
read(fd, buf, 4096);

When to use O_DIRECT:

  • Application does its own caching (databases)
  • Very large files that won’t benefit from cache
  • Avoiding double-buffering
FunctionGuarantees
fsync(fd)File data + metadata flushed to disk
fdatasync()File data flushed (metadata may lag)
sync()Flush all buffers (system-wide)
O_SYNCEvery write is synchronous
O_DSYNCData sync on each write
Terminal window
# Ensure write durability
echo "data" > file
sync # System-wide, sledgehammer
# Better: fsync specific file
python -c "import os; f=open('file','a'); f.flush(); os.fsync(f.fileno())"
#!/bin/bash
config="/etc/myapp/config"
new_config=$(mktemp "${config}.XXXXXX")
# Write new config
cat > "$new_config" << 'EOF'
key=value
EOF
# Preserve permissions
chmod --reference="$config" "$new_config" 2>/dev/null || chmod 644 "$new_config"
# Atomic replacement
mv "$new_config" "$config"
import os
import fcntl
def increment_counter(path):
    """Increment the integer stored in the file at *path* under an exclusive lock.

    The file is created if missing; an empty file counts as 0. The
    read-modify-write cycle runs while holding a blocking ``flock``, so
    cooperating processes that use this function never lose an update.

    Args:
        path: Path of the counter file.

    Returns:
        The new counter value after incrementing.

    Raises:
        OSError: If the file cannot be opened, locked, read or written.
        ValueError: If the file content is not an integer.
    """
    # Pass an explicit mode: os.open() defaults to 0o777, which would create
    # the counter file with execute bits set.
    fd = os.open(path, os.O_RDWR | os.O_CREAT, 0o644)
    try:
        fcntl.flock(fd, fcntl.LOCK_EX)
        raw = os.read(fd, 100).strip()
        count = (int(raw) if raw else 0) + 1
        # Rewrite from the start; truncating first drops any stale trailing
        # bytes left by a previously longer value.
        os.ftruncate(fd, 0)
        os.lseek(fd, 0, os.SEEK_SET)
        os.write(fd, str(count).encode())
        os.fsync(fd)
        return count
    finally:
        fcntl.flock(fd, fcntl.LOCK_UN)
        os.close(fd)
Terminal window
# Using inotifywait (Linux)
inotifywait -m -e create /path/to/dir |
while read dir action file; do
echo "New file: $file"
done
# Using fswatch (macOS/Linux)
fswatch -0 /path/to/dir | xargs -0 -n1 echo
# Polling fallback (portable)
while true; do
ls -1 /path/to/dir > /tmp/current
diff /tmp/previous /tmp/current 2>/dev/null | grep "^>"
mv /tmp/current /tmp/previous
sleep 1
done
ErrorMeaning
ENOENTFile doesn’t exist
EEXISTFile already exists
EACCESPermission denied
EBUSYResource busy (mounted, locked)
EXDEVCross-device link (rename across fs)
ENOSPCNo space left
ENOLCKNo locks available
EWOULDBLOCKWould block (non-blocking op)
ESTALEStale NFS file handle
SyscallPurpose
open()Open/create file
close()Close descriptor
read()Read bytes
write()Write bytes
rename()Atomic rename
unlink()Remove file
link()Create hard link
symlink()Create symbolic link
stat()Get file metadata
fstat()Get metadata via FD
fsync()Flush to disk
flock()Advisory file lock
fcntl()File control (locks, flags)
  • Filesystem - Core concepts: FHS, inodes, permissions, links
  • Unix - General shell commands
  • Shell - Scripting patterns
  • Performance - Profiling and optimization