Replication failed 3.0.5 -> 3.0.13, now 3.0.13->3.0.13

Olaf Frączyk olaf at navi.pl
Tue Apr 21 09:21:11 EDT 2020


I managed to get strace on both sides, however it doesn't make me wiser 
- there is nothing obvious for me.

Additionally I see that replication works more or less for new messages, 
but older are not processed.

I have several subfolders in my mailbox, some of them unreplicated. If I 
change anything in the subfolder now - the folder is replicated, but 
other subfolders remain not replicated - unless I change anything in them.

Below strace, maybe someone can find anything meaningful from it:

1: On replica

rename("/var/lib/imap/proc/6794.new", "/var/lib/imap/proc/6794") = 0
getpid()                                = 6794
openat(AT_FDCWD, "/var/lib/imap/proc/6794.new", O_RDWR|O_CREAT|O_TRUNC, 
0666) = 12
fstat(12, {st_mode=S_IFREG|0600, st_size=0, ...}) = 0
write(12, "imap\tifs.local.navi.pl [192.168."..., 53) = 53
close(12)                               = 0
rename("/var/lib/imap/proc/6794.new", "/var/lib/imap/proc/6794") = 0
rt_sigprocmask(SIG_BLOCK, [INT QUIT ALRM TERM CHLD], [], 8) = 0
pselect6(1, [0], NULL, NULL, {tv_sec=1920, tv_nsec=0}, {[], 8}) = 1 (in 
[0], left {tv_sec=1919, tv_nsec=999999238})
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [INT QUIT ALRM TERM CHLD], [], 8) = 0
pselect6(1, [0], NULL, NULL, {tv_sec=1920, tv_nsec=0}, {[], 8}) = 1 (in 
[0], left {tv_sec=1919, tv_nsec=999999439})
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
read(0, "", 5)                          = 0
unlink("/var/lib/imap/proc/6794")       = 0
openat(AT_FDCWD, "/dev/null", O_RDWR)   = 12
shutdown(0, SHUT_RD)                    = -1 ENOTCONN (Transport 
endpoint is not connected)
dup2(12, 0)                             = 0
shutdown(1, SHUT_RD)                    = -1 ENOTCONN (Transport 
endpoint is not connected)
dup2(12, 1)                             = 1
shutdown(2, SHUT_RD)                    = -1 ENOTCONN (Transport 
endpoint is not connected)
dup2(12, 2)                             = 2
close(12)                               = 0
close(11)                               = 0
getpid()                                = 6794
write(3, "\1\0\0\0\212\32\0\0", 8)      = 8
rt_sigaction(SIGALRM, {sa_handler=0x7f57e06da710, sa_mask=[], 
sa_flags=SA_RESTORER|SA_RESETHAND|SA_SIGINFO, 
sa_restorer=0x7f57dc71a960}, NULL, 8) = 0
rt_sigaction(SIGQUIT, {sa_handler=0x7f57e06da710, sa_mask=[], 
sa_flags=SA_RESTORER|SA_RESETHAND|SA_SIGINFO, 
sa_restorer=0x7f57dc71a960}, NULL, 8) = 0
rt_sigaction(SIGINT, {sa_handler=0x7f57e06da710, sa_mask=[], 
sa_flags=SA_RESTORER|SA_RESETHAND|SA_SIGINFO, 
sa_restorer=0x7f57dc71a960}, NULL, 8) = 0
rt_sigaction(SIGTERM, {sa_handler=0x7f57e06da710, sa_mask=[], 
sa_flags=SA_RESTORER|SA_RESETHAND|SA_SIGINFO, 
sa_restorer=0x7f57dc71a960}, NULL, 8) = 0
rt_sigaction(SIGUSR2, {sa_handler=0x7f57e06da710, sa_mask=[], 
sa_flags=SA_RESTORER|SA_RESETHAND|SA_SIGINFO, 
sa_restorer=0x7f57dc71a960}, NULL, 8) = 0
rt_sigaction(SIGHUP, {sa_handler=0x7f57e06da710, sa_mask=[], 
sa_flags=SA_RESTORER|SA_RESTART|SA_SIGINFO, sa_restorer=0x7f57dc71a960}, 
NULL, 8) = 0
alarm(83)                               = 0
fcntl(8, F_SETLKW, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, 
l_len=0}) = 0
stat("/usr/local/cyrus-3.0.13/libexec/imapd", {st_mode=S_IFREG|0755, 
st_size=1181120, ...}) = 0
rt_sigaction(SIGHUP, {sa_handler=0x7f57e06da710, sa_mask=[], 
sa_flags=SA_RESTORER|SA_SIGINFO, sa_restorer=0x7f57dc71a960}, NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [INT QUIT ALRM TERM CHLD], [], 8) = 0
pselect6(5, [4], NULL, NULL, NULL, {[], 8}) = ? ERESTARTNOHAND (To be 
restarted if no handler)
--- SIGALRM {si_signo=SIGALRM, si_code=SI_KERNEL} ---
rt_sigreturn({mask=[INT QUIT ALRM TERM CHLD]}) = -1 EINTR (Interrupted 
system call)
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
rt_sigaction(SIGHUP, {sa_handler=0x7f57e06da710, sa_mask=[], 
sa_flags=SA_RESTORER|SA_RESTART|SA_SIGINFO, sa_restorer=0x7f57dc71a960}, 
NULL, 8) = 0
fcntl(8, F_SETLKW, {l_type=F_UNLCK, l_whence=SEEK_SET, l_start=0, 
l_len=0}) = 0
close(5)                                = 0
munmap(0x7f57e14c1000, 16384)           = 0
close(7)                                = 0
munmap(0x7f57e14bd000, 16384)           = 0
close(6)                                = 0
unlink("/var/lib/imap/socket/idle.6794") = 0
close(9)                                = 0
munmap(0x7f57e14a9000, 16384)           = 0
exit_group(0)                           = ?
+++ exited with 0 +++


2. On master:

nanosleep({1, 0}, 0x7ffe12318060)       = 0
stat("/var/local/imapd_sync_stop", 0x7ffe12318280) = -1 ENOENT (No such 
file or directory)
stat("/var/lib/imap/sync/log-run", 0x7ffe12318190) = -1 ENOENT (No such 
file or directory)
stat("/var/lib/imap/sync/log", {st_mode=S_IFREG|0600, st_size=26, ...}) = 0
rename("/var/lib/imap/sync/log", "/var/lib/imap/sync/log-run") = 0
open("/var/lib/imap/sync/log-run", O_RDWR) = 8
fcntl(8, F_SETLKW, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, 
l_len=0}) = 0
fcntl(8, F_SETLKW, {l_type=F_UNLCK, l_whence=SEEK_SET, l_start=0, 
l_len=0}) = 0
read(8, "MAILBOX navi.pl!user.olaf\n", 4096) = 26
read(8, "", 4096)                       = 0
write(6, "\27\3\3\0B\372\222\265gb\346\246$\211 
\367e\241\212\30\364\345[GJ\204\225\n\255?\356\343"..., 71) = 71
rt_sigprocmask(SIG_BLOCK, [INT QUIT ALRM TERM CHLD], [], 8) = 0
pselect6(7, [6], NULL, NULL, {0, 0}, {[], 8}) = 0 (Timeout)
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
sendto(7, "<179>Apr 21 14:56:46 sync_client"..., 103, MSG_NOSIGNAL, 
NULL, 0) = 103
write(6, 
"\27\3\3\0(\372\222\265gb\346\246%\216\222#\336\16t\320\236|bv\25\261)6/\242\217l"..., 
45) = 45
sendto(7, "<179>Apr 21 14:56:46 sync_client"..., 101, MSG_NOSIGNAL, 
NULL, 0) = 101
sendto(7, "<179>Apr 21 14:56:46 sync_client"..., 86, MSG_NOSIGNAL, NULL, 
0) = 86
sendto(7, "<179>Apr 21 14:56:46 sync_client"..., 113, MSG_NOSIGNAL, 
NULL, 0) = 113
close(8)                                = 0
write(6, 
"\27\3\3\0\"\372\222\265gb\346\246&\261B\342|\260\231\217\307[\16\\\335\211\324w\345\2544\""..., 
39) = 39
shutdown(6, SHUT_RD)                    = 0
close(6)                                = 0
close(5)                                = 0
munmap(0x7fc6ed5bb000, 16384)           = 0
close(3)                                = 0
munmap(0x7fc6ed5d0000, 49152)           = 0
close(4)                                = 0
exit_group(0)                           = ?
+++ exited with 0 +++

On 2020-04-21 15:00, Michael Menge wrote:
>
> Quoting Olaf Frączyk <olaf at navi.pl>:
>
>> Thank you for the telemetry hint :)
>>
>> I don't use the syncserver - the replication is done via IMAP port on 
>> the replica side. I have no idea how to have strace spawned by cyrus 
>> master process. When I attach later to imapd using strace -p I'm 
>> afraid some info already will be lost.
>>
>
> You can try "/usr/bin/strace <strace_option> <path_to_imapd>/imapd 
> <imapd_options>" as cmd
> or you can use prefork=1 in the service to prefok one imap process and 
> connect
> to this one before you start the sync_client.
>
>
>
>> The syncserver is marked as deprecated in the docs, so I went with 
>> the more modern option. Maybe here is the problem ;)
>>
>
> i am using syncserver since cyrus-imapd 2.3. So I had no reason to 
> change it.
>
>
>> The funny thing is that from time to time the replication progresses 
>> a little. I don't like non-repetitive behavior ;)
>>
>> Thanks again for the hints.
>>
>> On 2020-04-21 14:13, Michael Menge wrote:
>>> Hi,
>>>
>>> Quoting Olaf Frączyk <olaf at navi.pl>:
>>>
>>>> The current situation is:
>>>>
>>>> 1. Replica:
>>>>
>>>> stopped and started the replica
>>>>
>>>> no activity on replica - iotop and top show nothing
>>>>
>>>> the only messages on replica is incoming connection from master
>>>>
>>>> 2. Master:
>>>>
>>>> when I run sync_client -r I still get:
>>>>
>>>> Apr 21 12:38:36 ifs sync_client[29518]: Reprocessing sync log file 
>>>> /var/lib/imap/sync/log-run
>>>> Apr 21 12:43:27 ifs sync_client[29518]: IOERROR: zero length 
>>>> response to MAILBOXES (idle for too long)
>>>> Apr 21 12:43:27 ifs sync_client[29518]: IOERROR: zero length 
>>>> response to RESTART (idle for too long)
>>>> Apr 21 12:43:27 ifs sync_client[29518]: Error in do_sync(): bailing 
>>>> out! Bad protocol
>>>> Apr 21 12:43:27 ifs sync_client[29518]: Processing sync log file 
>>>> /var/lib/imap/sync/log-run failed: Bad protocol
>>>>
>>>> 3. There is 27 GB of about 45 GB replicated and there is no further 
>>>> progress
>>>>
>>>> 4. How to find out why the replica doesn't respond?
>>>>
>>>
>>> You can enable telemetry logging on the replica by creating a folder 
>>> /var/lib/imap/log/<SYNC_AUTHNAME>
>>> where <SYNC_AUTHNAME> is the value of the sync_authname. If you give 
>>> cyrus write permissions for this folder
>>> it will create log-files for each process and what it received and 
>>> send from/to the sync_client with timestamps.
>>>
>>> Also you can try to use strace on the syncserver process to figure 
>>> out which files it is accessing
>>>
>>>
>>>> On 2020-04-21 11:18, Olaf Frączyk wrote:
>>>>> I also found out that when I see on master:
>>>>>
>>>>> Apr 21 11:12:38 ifs sync_client[27996]: IOERROR: zero length 
>>>>> response to MAILBOXES (idle for too long)
>>>>> Apr 21 11:12:38 ifs sync_client[27996]: IOERROR: zero length 
>>>>> response to RESTART (idle for too long)
>>>>> Apr 21 11:12:38 ifs sync_client[27996]: Error in do_sync(): 
>>>>> bailing out! Bad protocol
>>>>> Apr 21 11:12:38 ifs sync_client[27996]: Processing sync log file 
>>>>> /var/lib/imap/sync/log-run failed: Bad protocol
>>>>>
>>>>> I also get on the replica:
>>>>>
>>>>> Apr 21 11:12:38 skink1 imap[5775]: Connection reset by peer, 
>>>>> closing connection
>>>>>
>>>>> But I also see, that before it happens, there is no activity both 
>>>>> on the replica and on the master for some time.
>>>>>
>>>>> So maybe the imap server process is not recovering correctly in 
>>>>> the longlock situation?
>>>>>
>>>>> On 2020-04-21 11:07, Olaf Frączyk wrote:
>>>>>> Hi,
>>>>>>
>>>>>> When I run sync_client -r on the master I see the following on 
>>>>>> the replica:
>>>>>>
>>>>>> Apr 21 10:56:15 skink1 imap[5775]: mailbox: longlock 
>>>>>> navi.pl!user.olaf for 1.7 seconds
>>>>>> Apr 21 10:56:20 skink1 imap[5775]: mailbox: longlock 
>>>>>> navi.pl!user.piotr for 2.0 seconds
>>>>>> Apr 21 10:56:23 skink1 imap[5775]: mailbox: longlock 
>>>>>> navi.pl!user.olaf for 2.9 seconds
>>>>>> Apr 21 10:56:26 skink1 imap[5775]: mailbox: longlock 
>>>>>> navi.pl!user.piotr for 3.0 seconds
>>>>>>
>>>>>> The mailboxes have several GB in Inbox folder and several GB in 
>>>>>> subfolders. The inbox folders have about 20,000 messages, the 
>>>>>> subfolders upto 15,000
>>>>>>
>>>>>> Could it cause problems?
>>>
>>> the longlock is normaly not the problem. While on process has the lock
>>> no other process can write to the mailbox, but on the replica there 
>>> normaly
>>> is no other process that should access the mailbox
>>>
>>>>>>
>>>>>> Maybe I should move the sync_client from START section to 
>>>>>> SERVICES, it seems that it is not automatically restarted
>>>>>>
>>> I havend tried starting sync_client in the service section.
>>> I start the sync_client via systemd.
>>>
>>>>>> On 2020-04-21 08:47, Michael Menge wrote:
>>>>>>> Hi Olaf
>>>>>>>
>>>>>>>
>>>>>>> Quoting Olaf Frączyk <olaf at navi.pl>:
>>>>>>>
>>>>>>>> Hi,
>>>>>>>>
>>>>>>>> I upgraded to 3.0.13 but it didn't help.
>>>>>>>>
>>>>>>>> This time it copied about 18GB
>>>>>>>>
>>>>>>>> in the logs I still see:
>>>>>>>>
>>>>>>>> 1 - inefficient replication
>>>>>>>>
>>>>>>>> 2 - IOERROR: zero length response to MAILBOXES (idle for too long)
>>>>>>>> IOERROR: zero length response to RESTART (idle for too long)
>>>>>>>> Error in do_sync(): bailing out! Bad protocol
>>>>>>>>
>>>>>>>> But I have no idea what can I do next and why it fails
>>>>>>>>
>>>>>>>> Apr 21 02:24:46 ifs sync_client[12656]: IOERROR: zero length 
>>>>>>>> response to MAILBOXES (idle for too long)
>>>>>>>> Apr 21 02:24:46 ifs sync_client[12656]: IOERROR: zero length 
>>>>>>>> response to RESTART (idle for too long)
>>>>>>>> Apr 21 02:24:46 ifs sync_client[12656]: Error in do_sync(): 
>>>>>>>> bailing out! Bad protocol
>>>>>>>
>>>>>>> do you see any errors on the syncserver side. The error look 
>>>>>>> like the
>>>>>>> sync_client is waiting for a reply of the server.
>>>
>>>
>>> -------------------------------------------------------------------------------- 
>>> M.Menge                                Tel.: (49) 7071/29-70316
>>> Universität Tübingen                   Fax.: (49) 7071/29-5912
>>> Zentrum für Datenverarbeitung          mail: 
>>> michael.menge at zdv.uni-tuebingen.de
>>> Wächterstraße 76
>>> 72074 Tübingen
>>>
>>> ----
>>> Cyrus Home Page: http://www.cyrusimap.org/
>>> List Archives/Info: http://lists.andrew.cmu.edu/pipermail/info-cyrus/
>>> To Unsubscribe:
>>> https://lists.andrew.cmu.edu/mailman/listinfo/info-cyrus
>> ----
>> Cyrus Home Page: http://www.cyrusimap.org/
>> List Archives/Info: http://lists.andrew.cmu.edu/pipermail/info-cyrus/
>> To Unsubscribe:
>> https://lists.andrew.cmu.edu/mailman/listinfo/info-cyrus
>
>
>
> -------------------------------------------------------------------------------- 
>
> M.Menge                                Tel.: (49) 7071/29-70316
> Universität Tübingen                   Fax.: (49) 7071/29-5912
> Zentrum für Datenverarbeitung          mail: 
> michael.menge at zdv.uni-tuebingen.de
> Wächterstraße 76
> 72074 Tübingen
>
> ----
> Cyrus Home Page: http://www.cyrusimap.org/
> List Archives/Info: http://lists.andrew.cmu.edu/pipermail/info-cyrus/
> To Unsubscribe:
> https://lists.andrew.cmu.edu/mailman/listinfo/info-cyrus


More information about the Info-cyrus mailing list