super.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286

/*
 * Copyright (C) 2015-2016 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <assert.h>
#include <inttypes.h>
#include <lk/compiler.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#ifndef LOCAL_TRACE
#define LOCAL_TRACE TRACE_LEVEL_INIT
#endif
#ifndef LOCAL_TRACE_ERR
#define LOCAL_TRACE_ERR TRACE_LEVEL_INIT
#endif

#include "array.h"
#include "block_allocator.h"
#include "block_cache.h"
#include "block_set.h"
#include "checkpoint.h"
#include "debug.h"
#include "error_reporting.h"
#include "file.h"
#include "fs.h"
#include "transaction.h"

#define SUPER_BLOCK_MAGIC (0x0073797473757274ULL) /* trustys */
#define SUPER_BLOCK_FLAGS_VERSION_MASK (0x3U)
#define SUPER_BLOCK_FLAGS_BLOCK_INDEX_MASK (0x1U)
#define SUPER_BLOCK_FLAGS_EMPTY (0x4U)
#define SUPER_BLOCK_FLAGS_ALTERNATE (0x8U)
#define SUPER_BLOCK_FLAGS_SUPPORTED_MASK (0xfU)
#define SUPER_BLOCK_FS_VERSION (0U)

/**
 * typedef super_block_opt_flags8_t - Optional flags, can be ORed together
 *
 * %SUPER_BLOCK_OPT_FLAGS_HAS_FLAGS3
 *   Indicates that the superblock has additional data after flags2 and that
 *   flags3 should be set to the same value as flags
 * %SUPER_BLOCK_OPT_FLAGS_HAS_CHECKPOINT
 *   Indicates that the superblock contains the @checkpoint field. When
 *   reading a superblock this flag is only honored if
 *   %SUPER_BLOCK_OPT_FLAGS_HAS_FLAGS3 is also set, since @checkpoint is one of
 *   the fields stored after flags2.
 * %SUPER_BLOCK_OPT_FLAGS_NEEDS_FULL_SCAN
 *   An error was detected in this file system, a full scan and possibly repair
 *   should be initiated on the next mount. Reset after scanning. Written
 *   whenever the in-memory needs_full_scan state of the file system is set.
 */
typedef uint8_t super_block_opt_flags8_t;
#define SUPER_BLOCK_OPT_FLAGS_HAS_FLAGS3 (0x1U)
#define SUPER_BLOCK_OPT_FLAGS_HAS_CHECKPOINT (0x2U)
#define SUPER_BLOCK_OPT_FLAGS_NEEDS_FULL_SCAN (0x4U)

/**
 * typedef super_block_required_flags16_t - Required FS flags, can be ORed
 *                                          together
 *
 * These flags are required to be supported by the current implementation; if
 * any unrecognized flag bits are set the file system must not be mounted.
 * Versions of the storage service prior to the addition of the @required_flags
 * field will interpret non-zero flags as a high @fs_version and will refuse to
 * mount the file-system.
 *
 * %SUPER_BLOCK_REQUIRED_FLAGS_MAIN_REPAIRED
 *   Indicates that the main (i.e. flags does not contain
 *   %SUPER_BLOCK_FLAGS_ALTERNATE) file system has been repaired in a manner
 *   that effectively resulted in rollback to a previous state since it was last
 *   cleared. This flag is required to be supported, if set, so that we do not
 *   discard a repaired state by running an older version of the storage
 *   service. This flag is cleared when the main file system is cleared, and
 *   therefore only tracks repairs since the file system was last cleared.
 *   It is written when either the committing transaction is marked repaired
 *   or the file system already carries the main_repaired state.
 * %SUPER_BLOCK_REQUIRED_FLAGS_MASK
 *   Mask of bits that are understood by the current storage implementation. If
 *   any bits of this field are set outside of this mask, do not mount the file
 *   system.
 */
typedef uint16_t super_block_required_flags16_t;
#define SUPER_BLOCK_REQUIRED_FLAGS_MAIN_REPAIRED (0x1U)
#define SUPER_BLOCK_REQUIRED_FLAGS_MASK \
    (SUPER_BLOCK_REQUIRED_FLAGS_MAIN_REPAIRED)

/**
 * struct super_block - On-disk root block for file system state
 * @iv:             Initial value used for encrypt/decrypt.
 * @magic:          SUPER_BLOCK_MAGIC.
 * @flags:          Version in bottom two bits, other bits are reserved.
 * @fs_version:     Required file system version. If greater than
 *                  %SUPER_BLOCK_FS_VERSION, do not mount or overwrite
 *                  filesystem.
 * @required_flags: Required file system flags. To mount this file system, any
 *                  non-zero flag bits set must be supported by the storage
 *                  implementation.
 * @block_size:     Block size of file system.
 * @block_num_size: Number of bytes used to store block numbers.
 * @mac_size:       Number of bytes used to store mac values.
 * @opt_flags:      Optional flags, any of &typedef super_block_opt_flags8_t
 *                  ORed together.
 * @res2:           Reserved for future use. Write 0, read ignore.
 * @block_count:    Size of file system.
 * @free:           Block and mac of free set root node.
 * @free_count:     Currently unused.
 * @files:          Block and mac of files tree root node.
 * @res3:           Reserved for future use. Write 0, read ignore.
 * @flags2:         Copy of @flags. Allows storing the super-block in a device
 *                  that does not support an atomic write of the entire
 *                  super-block.
 * @backup:         Backup of previous super-block, used to support an alternate
 *                  backing store. 0 if no backup has ever been written. Once a
 *                  backup exists, it will only ever be swapped, not cleared.
 * @checkpoint:     Block and mac of checkpoint metadata block. 0 if a
 *                  checkpoint does not exist.
 * @res4:           Reserved for future use. Write 0, read ignore.
 * @flags3:         Copy of @flags. Allows storing the super-block in a device
 *                  that does not support an atomic write of the entire
 *                  super-block. If SUPER_BLOCK_OPT_FLAGS_HAS_FLAGS3 is not set,
 *                  @flags3 is not checked and fields after @flags2 are ignored.
 *
 * Block numbers and macs in @free and @files are packed as indicated by
 * @block_num_size and @mac_size, but unlike other on-disk data, the size of the
 * whole field is always the full 24 bytes needed for an 8 byte block number and
 * a 16 byte mac. This allows @flags2 and @flags3 to be validated before
 * knowing @block_num_size and @mac_size.
 *
 * This is an on-disk format: field order, types and sizes must not change. The
 * static assertions that follow the definition pin the offsets that the
 * format depends on.
 */
struct super_block {
    struct iv iv;
    uint64_t magic;
    uint32_t flags;
    uint16_t fs_version;
    super_block_required_flags16_t required_flags;
    uint32_t block_size;
    uint8_t block_num_size;
    uint8_t mac_size;
    super_block_opt_flags8_t opt_flags;
    uint8_t res2;
    data_block_t block_count;
    struct block_mac free;
    data_block_t free_count;
    struct block_mac files;
    uint32_t res3[5];
    uint32_t flags2;
    struct super_block_backup backup;
    struct block_mac checkpoint;
    uint32_t res4[6];
    uint32_t flags3;
};
/* The two flag copies sit at the end of each 128-byte half of the block. */
STATIC_ASSERT(offsetof(struct super_block, flags2) == 124);
STATIC_ASSERT(offsetof(struct super_block, flags3) == 252);
STATIC_ASSERT(sizeof(struct super_block) == 256);

/*
 * We rely on these offsets in future_fs_version_test and
 * unknown_required_flags_test in the storage_block_test to test that we will
 * not mount or modify a super block with unknown version or fs flags.
 */
STATIC_ASSERT(offsetof(struct super_block, fs_version) == 28);
STATIC_ASSERT(offsetof(struct super_block, required_flags) == 30);

/*
 * block_device_tipc.c ensures that we have at least 256 bytes in RPMB blocks.
 * This bound is already implied by the exact size assertion above; it is kept
 * to document the RPMB constraint separately.
 */
STATIC_ASSERT(sizeof(struct super_block) <= 256);

/*
 * File-scope list head, statically initialized with LIST_INITIAL_VALUE.
 * NOTE(review): presumably links the file systems set up by this file; the
 * code that adds and removes entries is outside this view - confirm.
 */
static struct list_node fs_list = LIST_INITIAL_VALUE(fs_list);

/**
 * update_super_block_internal - Generate and write superblock
 * @tr:         Transaction object.
 * @free:       New free root.
 * @files:      New files root.
 * @checkpoint: New checkpoint metadata block.
 * @pinned:     New block should not be reused in the block cache until
 *              it is successfully written.
 *
 * Return: %true if super block was updated (in cache), %false if transaction
 * failed before super block was updated.
 */
static bool update_super_block_internal(struct transaction* tr,
                                        const struct block_mac* free,
                                        const struct block_mac* files,
                                        const struct block_mac* checkpoint,
                                        bool pinned) {
    struct super_block* super_rw;
    struct obj_ref super_ref = OBJ_REF_INITIAL_VALUE(super_ref);
    unsigned int ver;
    unsigned int index;
    super_block_required_flags16_t required_flags = 0;
    uint32_t flags;
    uint32_t block_size = tr->fs->super_dev->block_size;
    super_block_opt_flags8_t opt_flags = SUPER_BLOCK_OPT_FLAGS_HAS_FLAGS3 |
                                         SUPER_BLOCK_OPT_FLAGS_HAS_CHECKPOINT;

    if (!tr->fs->writable) {
        pr_err("Attempting to write superblock for read-only filesystem\n");
        if (!tr->failed) {
            transaction_fail(tr);
        }
        return false;
    }

    assert(block_size >= sizeof(struct super_block));
    assert(tr->fs->initial_super_block_tr == NULL ||
           tr->fs->initial_super_block_tr == tr);

    ver = (tr->fs->super_block_version + 1) & SUPER_BLOCK_FLAGS_VERSION_MASK;
    index = ver & SUPER_BLOCK_FLAGS_BLOCK_INDEX_MASK;
    flags = ver;
    if (!free && !files) {
        /*
         * If the free and files trees are not provided, the filesystem is in
         * the initial empty state.
         */
        flags |= SUPER_BLOCK_FLAGS_EMPTY;
    } else {
        /* Non-empty filesystems must have both trees (with root node blocks) */
        assert(free);
        assert(files);
    }
    if (tr->fs->alternate_data) {
        flags |= SUPER_BLOCK_FLAGS_ALTERNATE;
    }
    if (tr->repaired || tr->fs->main_repaired) {
        /*
         * We don't track repairs in alternate data mode, so we shouldn't do
         * them - ensure the transaction does not include a repair if we are in
         * alternate state. The FS flag is used to persist the state for the
         * main FS.
         */
        assert(!tr->repaired || !tr->fs->alternate_data);
        required_flags |= SUPER_BLOCK_REQUIRED_FLAGS_MAIN_REPAIRED;
        /*
         * TODO: We would like to track the number of repairs in addition to the
         * current repair state. This may be up to three different counters: 1)
         * the number of times this fs has been repaired over the device
         * lifetime to report in metrics, 2) the number of repairs since last
         * clear, and 3) the overall fs generation count (number of device
         * lifetime repairs+clears). 2) and 3) would primarily be useful if we
         * expose them to clients via a new query API, while 1) would mostly be
         * for device metrics. We can implement some or all of these counters
         * when we add an API that consumes them.
         */
    }
    if (tr->fs->needs_full_scan) {
        opt_flags |= SUPER_BLOCK_OPT_FLAGS_NEEDS_FULL_SCAN;
    }

    pr_write("write super block %" PRIu64 ", ver %d\n",
             tr->fs->super_block[index], ver);

    super_rw = block_get_cleared_super(tr, tr->fs->super_block[index],
                                       &super_ref, pinned);
    if (tr->failed) {
        block_put_dirty_discard(super_rw, &super_ref);
        return false;
    }
    super_rw->magic = SUPER_BLOCK_MAGIC;
    super_rw->flags = flags;
    /* TODO: keep existing fs version when possible */
    super_rw->fs_version = SUPER_BLOCK_FS_VERSION;
    super_rw->required_flags = required_flags;
    super_rw->block_size = tr->fs->dev->block_size;
    super_rw->block_num_size = tr->fs->block_num_size;
    super_rw->mac_size = tr->fs->mac_size;
    super_rw->opt_flags = opt_flags;
    super_rw->block_count = tr->fs->dev->block_count;
    if (free) {
        super_rw->free = *free;
    }
    super_rw->free_count = 0; /* TODO: remove or update */
    if (files) {
        super_rw->files = *files;
    }
    if (checkpoint) {
        super_rw->checkpoint = *checkpoint;
    }
    super_rw->flags2 = flags;
    super_rw->backup = tr->fs->backup;
    super_rw->flags3 = flags;
    tr->fs->written_super_block_version = ver;

    block_put_dirty_no_mac(super_rw, &super_ref, tr->fs->allow_tampering);

    return true;
}

/**
 * update_super_block - Generate and write an unpinned superblock
 * @tr:         Transaction object.
 * @free:       New free root.
 * @files:      New files root.
 * @checkpoint: New checkpoint metadata block.
 *
 * Forwards to update_super_block_internal() and always requests a superblock
 * that is not pinned in the block cache.
 *
 * Return: %true if super block was updated (in cache), %false if transaction
 * failed before super block was updated.
 */
bool update_super_block(struct transaction* tr,
                        const struct block_mac* free,
                        const struct block_mac* files,
                        const struct block_mac* checkpoint) {
    const bool pinned = false;

    return update_super_block_internal(tr, free, files, checkpoint, pinned);
}

/**
 * write_initial_super_block - Queue an empty superblock in a new internal
 *                             transaction
 * @fs:         File system state object.
 *
 * Allocates a transaction, stores it in @fs->initial_super_block_tr and uses
 * it to write a pinned superblock with no free, files or checkpoint roots.
 *
 * When needed, this must be called before creating any other transactions on
 * this filesystem so we don't fill up the cache with entries that can't be
 * flushed to make room for this block.
 *
 * If the superblock update fails, the transaction stays attached to @fs; it
 * is not freed here.
 *
 * Return: %true if the initial empty superblock was successfully written to the
 * cache, or %false otherwise (including allocation failure).
 */
static bool write_initial_super_block(struct fs* fs) {
    struct transaction* initial_tr = calloc(1, sizeof(*initial_tr));

    if (initial_tr == NULL) {
        return false;
    }

    fs->initial_super_block_tr = initial_tr;
    transaction_init(initial_tr, fs, true);

    return update_super_block_internal(initial_tr, NULL, NULL, NULL, true);
}

/**
 * write_current_super_block - Write current superblock to internal transaction
 * @fs:           File system state object.
 * @reinitialize: Allow the special transaction to be reinitialized if it has
 *                failed
 *
 * Write the current state of the super block to an internal transaction that
 * will be written before any other block. This can be used to re-sync the
 * in-memory fs-state with the on-disk state after detecting a write failure
 * where no longer know the on-disk super block state.
 *
 * @fs must be writable when calling this function.
 */
void write_current_super_block(struct fs* fs, bool reinitialize) {
    bool super_block_updated;
    struct transaction* tr;

    assert(fs->writable);

    if (fs->initial_super_block_tr) {
        /*
         * If initial_super_block_tr is already pending and not failed there is
         * no need to allocate a new one so return early.
         *
         * If the special transaction has failed, we need to re-initialize it so
         * that we can attempt to recover to a good state.
         *
         * We are only allowed to reinitialze if the @reinitialize parameter is
         * true. We don't want to allow reinitialization while cleaning blocks
         * (i.e. via fs_unknown_super_block_state_all()), as this would reset
         * the special transaction to non-failed state and create a situation
         * where transaction_initial_super_block_complete() cannot know if it
         * successfully flushed the special transaction to disk. Therefore we
         * only allow transaction_initial_super_block_complete() to reinitialize
         * a failed special transaction after it attempts and fails to write the
         * block to disk.
         *
         * Since we pin special superblock entries in the block cache and
         * therefore cannot evict them with normal transactions,
         * transaction_initial_super_block_complete() is the only place we can
         * attempt a special transaction write, and if it fails the transaction
         * is immediately reinitialized. Therefore we should only ever be in a
         * failed state if reinitialize is true (i.e. we are being called from
         * transaction_initial_super_block_complete()).
         */

        assert(reinitialize || !fs->initial_super_block_tr->failed);
        if (!fs->initial_super_block_tr->failed || !reinitialize) {
            return;
        }

        tr = fs->initial_super_block_tr;
        transaction_activate(tr);
    } else {
        tr = calloc(1, sizeof(*tr));
        if (!tr) {
            /* Not safe to proceed. TODO: add flag to defer this allocation? */
            abort();
        }
        transaction_init(tr, fs, true);
        fs->initial_super_block_tr = tr;
    }

    /*
     * Until the filesystem contains committed data, fs->free.block_tree.root
     * will be zero, i.e. an invalid block mac. fs->free.block_tree.root is only
     * updated in transaction_complete() after successfully writing a new
     * superblock. If the filesystem is empty, we need to emit a cleared
     * superblock with a special flag to prevent the superblock state from
     * getting out of sync with the filesystem data if a reboot occurrs before
     * committing a superblock with data.
     *
     * We can't use fs->files.root here because it may be invalid if there are
     * no files in the filesystem. If the free node is zero, then the files node
     * must be as well, so we assert this.
     */
    bool fs_is_cleared = !block_mac_valid(tr, &fs->free.block_tree.root);
    if (fs_is_cleared) {
        assert(!block_mac_valid(tr, &fs->files.root));
        super_block_updated =
                update_super_block_internal(tr, NULL, NULL, NULL, true);
    } else {
        super_block_updated = update_super_block_internal(
                tr, &fs->free.block_tree.root, &fs->files.root, &fs->checkpoint,
                true);
    }
    if (!super_block_updated) {
        /* Not safe to proceed. TODO: add flag to try again? */
        fprintf(stderr,
                "Could not create pending write for current superblock state. "
                "Not safe to proceed.\n");
        abort();
    }
}

/**
 * fs_mark_scan_required - Require a full scan for invalid blocks the next time
 *                         this FS is mounted
 * @fs:             File system object
 *
 * Sets @fs->needs_full_scan so that the next superblock written carries the
 * scan-required flag. If @fs is writable, a copy of the current super block is
 * written and completed immediately, so the flag persists even with no further
 * writes to the file system. If @fs is not writable, only the in-memory state
 * is updated.
 */
void fs_mark_scan_required(struct fs* fs) {
    fs->needs_full_scan = true;

    if (fs->writable) {
        write_current_super_block(fs, false);
        assert(fs->initial_super_block_tr);
        transaction_initial_super_block_complete(fs->initial_super_block_tr);
    }
    /* Otherwise the superblock can't be written back until @fs is writable. */
}

/**
 * super_block_valid - Check if superblock is valid
 * @dev:        Block device that super block was read from.
 * @super:      Super block data.
 *
 * Checks, in order: the magic value, that @flags matches @flags2 (and @flags3
 * when %SUPER_BLOCK_OPT_FLAGS_HAS_FLAGS3 is set), the file system version,
 * that no unsupported @flags bits are set, and that the block size, block
 * number size and mac size match @dev.
 *
 * A super block whose @fs_version is newer than %SUPER_BLOCK_FS_VERSION is
 * reported as valid without looking at the remaining fields, since their
 * meaning may have changed. NOTE(review): callers are expected to check
 * @fs_version again before mounting or overwriting such a file system -
 * confirm against fs_init_from_super().
 *
 * Return: %true if @super is valid for @dev, %false otherwise.
 */
static bool super_block_valid(const struct block_device* dev,
                              const struct super_block* super) {
    if (super->magic != SUPER_BLOCK_MAGIC) {
        pr_init("bad magic, 0x%" PRIx64 "\n", super->magic);
        return false;
    }
    if (super->flags != super->flags2) {
        pr_warn("flags, 0x%x, does not match flags2, 0x%x\n", super->flags,
                super->flags2);
        return false;
    }
    if ((super->opt_flags & SUPER_BLOCK_OPT_FLAGS_HAS_FLAGS3) &&
        super->flags != super->flags3) {
        pr_warn("flags, 0x%x, does not match flags3, 0x%x\n", super->flags,
                super->flags3);
        return false;
    }
    if (super->fs_version > SUPER_BLOCK_FS_VERSION) {
        /* Deliberately "valid": the layout checks below may not apply. */
        pr_warn("super block is from the future: 0x%x\n", super->fs_version);
        return true;
    }
    if (super->flags & ~SUPER_BLOCK_FLAGS_SUPPORTED_MASK) {
        pr_warn("unknown flags set, 0x%x\n", super->flags);
        return false;
    }
    if (super->block_size != dev->block_size) {
        pr_warn("bad block size 0x%x, expected 0x%zx\n", super->block_size,
                dev->block_size);
        return false;
    }
    if (super->block_num_size != dev->block_num_size) {
        pr_warn("invalid block_num_size %d, expected %zd\n",
                super->block_num_size, dev->block_num_size);
        return false;
    }
    if (super->mac_size != dev->mac_size) {
        pr_warn("invalid mac_size %d, expected %zd\n", super->mac_size,
                dev->mac_size);
        return false;
    }
    /* A device without tamper detection must store full-size macs. */
    if (!dev->tamper_detecting && super->mac_size != sizeof(struct mac)) {
        /* Log the size that was actually compared against. */
        pr_warn("invalid mac_size %d != %zu\n", super->mac_size,
                sizeof(struct mac));
        return false;
    }

    return true;
}

/**
 * super_version_delta - Find the version delta between two superblocks
 * @new_super: Candidate new superblock
 * @old_super: Old superblock
 *
 * The versions form a wrapping two-bit counter, so the subtraction below is
 * allowed to wrap on purpose; the unsigned-overflow sanitizer is disabled for
 * this function for that reason. Only the version bits of the difference are
 * kept.
 *
 * Return: Wrapped difference between the two bit version numbers in the two
 * superblocks. This will be 1 when new is newer than old, 3 when old is
 * newer than new, and any other number indicates an invalid/corrupt version.
 */
__attribute__((no_sanitize("unsigned-integer-overflow"))) static inline uint8_t
super_version_delta(const struct super_block* new_super,
                    const struct super_block* old_super) {
    const uint32_t wrapped_diff = new_super->flags - old_super->flags;

    return wrapped_diff & SUPER_BLOCK_FLAGS_VERSION_MASK;
}

/**
 * use_new_super - Check if new superblock is valid and more recent than old
 * @dev:                Block device that super block was read from.
 * @new_super:          New super block data.
 * @new_super_index:    Index that @new_super was read from.
 * @old_super:          Old super block data, or %NULL.
 *
 * @new_super is rejected if it fails super_block_valid() or if the block
 * index bit in its flags does not match @new_super_index. With no @old_super
 * to compare against, a valid @new_super is always accepted. Otherwise the
 * wrapped version delta decides: 1 means @new_super is newer, 3 means
 * @old_super is newer, and any other delta is logged as bad and rejected.
 *
 * Return: %true if @new_super is valid for @dev, and more recent than
 * @old_super (or @old_super is %NULL), %false otherwise.
 */
static bool use_new_super(const struct block_device* dev,
                          const struct super_block* new_super,
                          unsigned int new_super_index,
                          const struct super_block* old_super) {
    uint8_t delta;

    if (!super_block_valid(dev, new_super)) {
        return false;
    }

    /* The low version bit must agree with the slot the block was read from. */
    if ((new_super->flags & SUPER_BLOCK_FLAGS_BLOCK_INDEX_MASK) !=
        new_super_index) {
        pr_warn("block index, 0x%x, does not match flags, 0x%x\n",
                new_super_index, new_super->flags);
        return false;
    }

    if (old_super == NULL) {
        return true;
    }

    delta = super_version_delta(new_super, old_super);
    pr_read("version delta, %d (new flags 0x%x, old flags 0x%x)\n", delta,
            new_super->flags, old_super->flags);

    switch (delta) {
    case 1:
        /* @new_super is exactly one version ahead of @old_super. */
        return true;
    case 3:
        /* @old_super is one version ahead; keep it. */
        return false;
    default:
        pr_warn("bad version delta, %d (new flags 0x%x, old flags 0x%x)\n",
                delta, new_super->flags, old_super->flags);
        return false;
    }
}

/* Forward declaration; the definition appears later in this file. */
static void fs_init_free_set(struct fs* fs, struct block_set* set);

/**
 * fs_set_roots - Initialize fs state from super block roots
 * @fs:                File system state object
 * @free:              Free set root node
 * @files:             Files tree root node
 * @checkpoint:        Checkpoint metadata block. May be NULL.
 * @restore_checkpoint: If %true, restore files and free roots from @checkpoint
 *                      (which must not be NULL).
 *
 * Unconditionally sets the filesystem roots to @free and @files respectively.
 * If @checkpoint is provided it is recorded in @fs and, when it is a valid
 * block mac, the checkpoint metadata is read into @fs->checkpoint_free.
 *
 * When @restore_checkpoint is %true and the checkpoint was read successfully,
 * the files root and the free set are replaced by the checkpointed state and
 * @fs->main_repaired is set. If the checkpoint cannot be read, or no
 * checkpoint exists (@checkpoint is not a valid block mac), nothing is
 * restored and the roots stay initialized to @free and @files.
 *
 * Returns %true if fs roots were correctly initialized as requested, %false if
 * reading the checkpoint failed or a requested checkpoint restore could not be
 * performed (but roots were still initialized to the provided blocks).
 */
static bool fs_set_roots(struct fs* fs,
                         const struct block_mac* free,
                         const struct block_mac* files,
                         const struct block_mac* checkpoint,
                         bool restore_checkpoint) {
    struct block_tree checkpoint_files =
            BLOCK_TREE_INITIAL_VALUE(checkpoint_files);
    struct transaction tr;
    bool success = true;

    assert(!restore_checkpoint || checkpoint);

    fs->free.block_tree.root = *free;
    fs->files.root = *files;

    if (!checkpoint) {
        /* No checkpoint field to process; the roots above are all we set. */
        return true;
    }

    fs->checkpoint = *checkpoint;
    transaction_init(&tr, fs, true);

    /*
     * fs->checkpoint_free is initialized to contain all blocks, so we
     * don't have to initialize it if there is no checkpoint on disk
     */
    assert(!block_range_empty(fs->checkpoint_free.initial_range));

    if (block_mac_valid(&tr, &fs->checkpoint)) {
        success = checkpoint_read(&tr, &fs->checkpoint, &checkpoint_files,
                                  &fs->checkpoint_free);
    } else if (restore_checkpoint) {
        /* We do not want to restore a non-existent checkpoint */
        success = false;
    }

    if (success && restore_checkpoint) {
        /*
         * Checkpoint restore counts as a repair which must set the repaired
         * flag. We disallow checkpoint restore in alternate mode in
         * fs_init().
         */
        fs->main_repaired = true;
        fs->files.root = checkpoint_files.root;
        block_set_copy_ro(&tr, &fs->free, &fs->checkpoint_free);
        /*
         * block_set_copy_ro() clears the copy_on_write flag for the free
         * set, so we have to reset it to allow modification.
         */
        fs->free.block_tree.copy_on_write = true;
    }

    /* The temporary transaction was only used for reading; drop it. */
    if (!tr.failed) {
        transaction_fail(&tr);
    }
    transaction_free(&tr);

    return success;
}

/**
 * fs_init_free_set - Initialize an initial free set for a file system
 * @fs:         File system state object.
 * @set:        Block set to initialize
 *
 * Initializes @set to the entire range of @fs, i.e. all blocks are free.
 */
static void fs_init_free_set(struct fs* fs, struct block_set* set) {
    struct block_range range = {
            .start = fs->min_block_num,
            .end = fs->dev->block_count,
    };
    block_set_add_initial_range(set, range);
}

/**
 * fs_init_from_super - Initialize file system from super block
 * @fs:         File system state object.
 * @super:      Superblock data, or %NULL.
 * @flags:      Any of &typedef fs_init_flags32_t, ORed together.
 *
 * First resets the in-memory state of @fs to an empty file system (free set,
 * file tree, checkpoint). If @super is present and acceptable, either the
 * active roots or the backup roots stored in it are selected, depending on
 * whether the alternate mode recorded in @super matches
 * %FS_INIT_FLAGS_ALTERNATE_DATA in @flags, and loaded with fs_set_roots().
 * Otherwise (no superblock, empty superblock, or a clear was requested or
 * triggered by recovery) the whole block range is added to the free set.
 *
 * Return: 0 if super block was usable, -1 if a fatal error was encountered and
 * initialization should not continue. The file system may not be readable, even
 * if this function returns 0. Check @fs->readable before attempting to read
 * from this file system. Within this function the only path that returns -1 is
 * a failing write_initial_super_block().
 */
static int fs_init_from_super(struct fs* fs,
                              const struct super_block* super,
                              fs_init_flags32_t flags) {
    /* Set when the selected superblock slot is flagged as an empty fs. */
    bool is_clear = false;
    /* Set by the caller's flag, or below when recovery decides to clear. */
    bool do_clear = flags & FS_INIT_FLAGS_DO_CLEAR;
    /*
     * True when the alternate mode recorded in the active superblock differs
     * from the mode requested in @flags, in which case the backup slot of the
     * superblock is loaded instead of the active one.
     */
    bool do_swap = false;
    bool do_clear_backup = false;
    bool has_backup_field =
            super && (super->opt_flags & SUPER_BLOCK_OPT_FLAGS_HAS_FLAGS3);
    bool has_checkpoint_field =
            has_backup_field && super &&
            (super->opt_flags & SUPER_BLOCK_OPT_FLAGS_HAS_CHECKPOINT);
    bool recovery_allowed = flags & FS_INIT_FLAGS_RECOVERY_CLEAR_ALLOWED;
    bool read_only = false;
    /*
     * The two root pointers below are only assigned on the paths that leave
     * is_clear false, and are only read when is_clear is false.
     */
    const struct block_mac* new_files_root;
    const struct block_mac* new_free_root;
    const struct block_mac* new_checkpoint = NULL;

    /*
     * We check that the super-block matches these block device params in
     * super_block_valid(). If these params change, the filesystem (and
     * alternate backup) will be wiped and reset with the new params.
     */
    fs->block_num_size = fs->dev->block_num_size;
    fs->mac_size = fs->dev->mac_size;

    /* Start from an empty, copy-on-write free set and file tree. */
    block_set_init(fs, &fs->free);
    fs->free.block_tree.copy_on_write = true;
    fs_file_tree_init(fs, &fs->files);
    fs->files.copy_on_write = true;
    fs->files.allow_copy_on_write = true;
    fs->main_repaired = false;

    memset(&fs->checkpoint, 0, sizeof(fs->checkpoint));
    block_set_init(fs, &fs->checkpoint_free);
    /*
     * checkpoint_init() will clear the checkpoint initial range if a valid
     * checkpoint exists.
     */
    fs_init_free_set(fs, &fs->checkpoint_free);

    /*
     * Reserve 1/4 for tmp blocks plus half of the remaining space, i.e.
     * 1/4 + (3/4) / 2 = 5/8 of the device.
     */
    fs->reserved_count = fs->dev->block_count / 8 * 5;

    fs->alternate_data = flags & FS_INIT_FLAGS_ALTERNATE_DATA;

    /*
     * Check version and flags after initializing an empty FS, so that we can
     * disallow writing and continue initializing other file systems. If we exit
     * early here this file system will be inaccessible, but its fields are
     * safely initialized.
     */
    if (super && super->fs_version > SUPER_BLOCK_FS_VERSION) {
        pr_err("ERROR: super block is from the future 0x%x\n",
               super->fs_version);
        error_report_superblock_invalid(fs->name);
        assert(!fs->readable);
        assert(!fs->writable);
        return 0;
    }

    if (super && (super->required_flags & ~SUPER_BLOCK_REQUIRED_FLAGS_MASK)) {
        pr_err("ERROR: super block requires unrecognized fs features: 0x%x\n",
               super->required_flags);
        error_report_superblock_invalid(fs->name);
        assert(!fs->readable);
        assert(!fs->writable);
        return 0;
    }

    /*
     * Decode the superblock: record its version and flags, pick which set of
     * roots to load and decide whether the fs is empty or must be cleared.
     */
    if (super) {
        fs->super_block_version = super->flags & SUPER_BLOCK_FLAGS_VERSION_MASK;
        fs->needs_full_scan =
                super->opt_flags & SUPER_BLOCK_OPT_FLAGS_NEEDS_FULL_SCAN;
        fs->main_repaired = super->required_flags &
                            SUPER_BLOCK_REQUIRED_FLAGS_MAIN_REPAIRED;

        /* Swap when exactly one of the two alternate flags is set. */
        do_swap = !(super->flags & SUPER_BLOCK_FLAGS_ALTERNATE) !=
                  !(flags & FS_INIT_FLAGS_ALTERNATE_DATA);

        if (do_swap) {
            pr_init("Swapping super-block with alternate\n");

            /* The active slot of @super becomes the in-memory backup slot. */
            fs->backup.flags = super->flags & (SUPER_BLOCK_FLAGS_EMPTY |
                                               SUPER_BLOCK_FLAGS_ALTERNATE);
            fs->backup.free = super->free;
            fs->backup.files = super->files;
            fs->backup.checkpoint = super->checkpoint;

            /* A missing or empty backup slot means we start from scratch. */
            if (!has_backup_field ||
                super->backup.flags & SUPER_BLOCK_FLAGS_EMPTY) {
                is_clear = true;
            } else if (has_backup_field) {
                new_files_root = &super->backup.files;
                new_free_root = &super->backup.free;
                if (has_checkpoint_field) {
                    new_checkpoint = &super->backup.checkpoint;
                }
            }
        } else {
            /* No swap: carry the stored backup slot over unchanged. */
            if (has_backup_field) {
                fs->backup = super->backup;
            }

            if (super->flags & SUPER_BLOCK_FLAGS_EMPTY) {
                is_clear = true;
            } else {
                new_files_root = &super->files;
                new_free_root = &super->free;
                if (has_checkpoint_field) {
                    new_checkpoint = &super->checkpoint;
                }
            }
        }

        /*
         * Probe both selected root blocks. A failed probe only turns into a
         * clear when the caller allowed recovery; otherwise the roots are
         * still handed to fs_set_roots() below.
         */
        if (!is_clear && !do_clear &&
            (!block_probe(fs, new_files_root, true) ||
             !block_probe(fs, new_free_root, false))) {
            pr_init("Backing file probe failed, fs is corrupted.\n");
            if (recovery_allowed) {
                pr_init("Attempting to clear corrupted fs.\n");
                do_clear = true;
            }
        }

        /*
         * Check that the block device has not shrunk. Shrinking is only allowed
         * in limited circumstances if we are also clearing the filesystem.
         */
        if (super->block_count > fs->dev->block_count) {
            if ((!do_clear) && (!is_clear)) {
                /*
                 * If block device is smaller than super and we're not clearing
                 * the fs, we want to prevent write access to avoid losing data.
                 * Read-only access is still allowed, although blocks may be
                 * missing.
                 */
                pr_err("bad block count 0x%" PRIx64 ", expected <= 0x%" PRIx64
                       "\n",
                       super->block_count, fs->dev->block_count);
                read_only = true;
            } else if (flags & FS_INIT_FLAGS_ALTERNATE_DATA) {
                /*
                 * Either we are on main filesystem and switching to alternate
                 * or we are on alternate. Either case is an error. If we get
                 * here, then the alternate FS is not backed by a temp file,
                 * which should never happen. We want to error loudly in this
                 * case, but continue mounting other file systems.
                 */
                pr_err("Can't clear fs if FS_INIT_FLAGS_ALTERNATE_DATA is"
                       " set .\n");
                assert(!fs->readable);
                assert(!fs->writable);
                return 0;
            } else {
                /*
                 * If we are on the main filesystem and the backup is an
                 * alternate, clear the backup also.
                 */
                do_clear_backup = true;
            }
        }
    }

    /*
     * A checkpoint restore on the main (non-alternate) file system drops any
     * pending full-scan request read from the superblock.
     */
    if (!fs->alternate_data && (flags & FS_INIT_FLAGS_RESTORE_CHECKPOINT)) {
        fs->needs_full_scan = false;
    }

    /*
     * If any of the following are true:
     * - we are initializing a new fs
     * - we are not swapping but detect an old superblock without the backup
     * - the block device has shrunk and the main filesystem is empty or being
     *   cleared
     * then ensure that the backup slot is a valid empty filesystem in case we
     * later switch filesystems without an explicit clear flag.
     */
    if (!super || (!do_swap && !has_backup_field) || do_clear_backup) {
        fs->backup = (struct super_block_backup){
                .flags = SUPER_BLOCK_FLAGS_EMPTY,
                .files = {0},
                .free = {0},
                .checkpoint = {0},
        };
    }

    if (super && !is_clear && !do_clear) {
        /* Load the selected roots, optionally restoring the checkpoint. */
        if (!fs_set_roots(fs, new_free_root, new_files_root, new_checkpoint,
                          flags & FS_INIT_FLAGS_RESTORE_CHECKPOINT)) {
            /*
             * fs_set_roots() returns false if the checkpoint restore failed,
             * but leaves the roots in a valid state to allow read-only access.
             */
            pr_err("fs %s: failed to initialize filesystem roots\n", fs->name);
            read_only = true;
        } else {
            /*
             * The checkpoint free set starts with a full initial range (see
             * above); an empty initial range therefore indicates that a
             * checkpoint was loaded.
             */
            pr_init("fs %s: loaded super block version %d, checkpoint exists: %d\n",
                    fs->name, fs->super_block_version,
                    block_range_empty(fs->checkpoint_free.initial_range));
        }
    } else {
        /* Empty, cleared or brand new file system: every block is free. */
        if (is_clear) {
            pr_init("fs %s: superblock, version %d, is empty fs\n", fs->name,
                    fs->super_block_version);
        } else if (do_clear) {
            pr_init("fs %s: clear requested, create empty, version %d\n",
                    fs->name, fs->super_block_version);
            /* Clearing the main fs also resets its repair/scan state. */
            if (!fs->alternate_data) {
                fs->main_repaired = false;
                fs->needs_full_scan = false;
            }
        } else {
            pr_init("fs %s: no valid super-block found, create empty\n",
                    fs->name);
        }
        fs_init_free_set(fs, &fs->free);
    }
    /* Sanity-check the size parameters against the device and our types. */
    assert(fs->block_num_size >= fs->dev->block_num_size);
    assert(fs->block_num_size <= sizeof(data_block_t));
    assert(fs->mac_size >= fs->dev->mac_size);
    assert(fs->mac_size <= sizeof(struct mac));
    assert(fs->mac_size == sizeof(struct mac) || fs->dev->tamper_detecting);

    /*
     * fs_set_roots() unconditionally set the files and free roots. If it fails,
     * it failed to read the checkpoint block but that should only block
     * modification, not reading.
     */
    fs->readable = true;

    /* Read-only mounts stop here, before anything is written back. */
    if (read_only) {
        assert(!fs->writable);
        return 0;
    }

    fs->writable = true;
    if (do_clear && !is_clear) {
        /*
         * The superblock did not already describe an empty fs, so the cleared
         * state has to be written out.
         */
        if (!write_initial_super_block(fs)) {
            return -1;
        }
    } else if (flags & FS_INIT_FLAGS_RESTORE_CHECKPOINT) {
        /*
         * Flush the new restored checkpoint to superblock before overwriting
         * any data blocks. We know that we can't already have a pending
         * initial_super_block_tr yet because we just made the filesystem
         * writable, and write_current_super_block() requires a writable
         * filesystem.
         */
        assert(!fs->initial_super_block_tr);
        write_current_super_block(fs, false);
    }

    return 0;
}

/**
 * load_super_block - Find and load superblock and initialize file system state
 * @fs:         File system state object.
 * @flags:      Any of &typedef fs_init_flags32_t, ORed together.
 *
 * Reads every super block slot listed in @fs->super_block, keeps the one that
 * use_new_super() prefers and hands it (or %NULL if none was selected) to
 * fs_init_from_super().
 *
 * Return: the result of fs_init_from_super(), or -1 if a super block slot
 * could not be read and @fs->allow_tampering is not set.
 */
static int load_super_block(struct fs* fs, fs_init_flags32_t flags) {
    const struct super_block* best = NULL;
    struct obj_ref best_ref = OBJ_REF_INITIAL_VALUE(best_ref);
    const struct super_block* candidate;
    struct obj_ref candidate_ref = OBJ_REF_INITIAL_VALUE(candidate_ref);
    unsigned int slot;
    int ret;

    assert(fs->super_dev->block_size >= sizeof(struct super_block));

    for (slot = 0; slot < countof(fs->super_block); slot++) {
        candidate =
                block_get_super(fs, fs->super_block[slot], &candidate_ref);
        if (!candidate) {
            if (!fs->allow_tampering) {
                pr_err("failed to read super-block\n");
                ret = -1;  // -EIO ? ERR_IO?;
                goto out;
            }
            /*
             * Superblock may not exist yet in non-secure storage, proceed
             * anyway
             */
            continue;
        }

        if (!use_new_super(fs->dev, candidate, slot, best)) {
            /* The block selected so far is still preferred; drop this one. */
            block_put(candidate, &candidate_ref);
            continue;
        }

        /* The candidate wins: release the previous pick and take it over. */
        if (best) {
            block_put(best, &best_ref);
        }
        best = candidate;
        obj_ref_transfer(&best_ref, &candidate_ref);
    }

    ret = fs_init_from_super(fs, best, flags);

out:
    if (best) {
        block_put(best, &best_ref);
    }
    return ret;
}

/**
 * struct fs_check_state - State shared between fs_check_full() and its
 *                         per-file callback fs_check_file()
 * @iter:                 Embedded iterator state passed to file_iterate();
 *                        fs_check_file() recovers the enclosing struct from it
 *                        with containerof().
 * @delete_invalid_files: Not read by the code in this part of the file.
 *                        NOTE(review): presumably selects whether invalid
 *                        files should be removed - confirm against users.
 * @internal_state_valid: Cleared when a tree was not fully readable, file info
 *                        could not be read, a file could not be opened or
 *                        file_check() failed.
 * @invalid_block_found:  Set when the transaction reported an invalid block
 *                        while checking the file tree or a file.
 */
struct fs_check_state {
    struct file_iterate_state iter;
    bool delete_invalid_files;

    bool internal_state_valid;
    bool invalid_block_found;
};

static bool fs_check_file(struct file_iterate_state* iter,
                          struct transaction* tr,
                          const struct block_mac* block_mac,
                          bool added,
                          bool removed) {
    struct fs_check_state* fs_check_state =
            containerof(iter, struct fs_check_state, iter);
    struct obj_ref info_ref = OBJ_REF_INITIAL_VALUE(info_ref);
    struct storage_file_handle file;
    char path[FS_PATH_MAX];

    assert(!tr->failed);
    assert(!tr->invalid_block_found);

    const struct file_info* info = file_get_info(tr, block_mac, &info_ref);
    if (!info) {
        pr_err("could not get file info at block %" PRIu64 "\n",
               block_mac_to_block(tr, block_mac));
        fs_check_state->internal_state_valid = false;
        goto err_file_info;
    }
    strncpy(path, info->path, sizeof(path));
    path[sizeof(path) - 1] = '\0';
    file_info_put(info, &info_ref);

    enum file_op_result result =
            file_open(tr, path, &file, FILE_OPEN_NO_CREATE, true);
    if (result != FILE_OP_SUCCESS) {
        /* TODO: is it ok to leak the filename here? we do it elsewhere */
        pr_err("could not open file %s\n", path);
        fs_check_state->internal_state_valid = false;
        goto err_file_open;
    }

    if (!file_check(tr, &file)) {
        fs_check_state->internal_state_valid = false;
    }

    file_close(&file);

err_file_open:
err_file_info:
    if (tr->invalid_block_found) {
        fs_check_state->invalid_block_found = true;
        /* We have noted the invalid block, reset for the next file. */
        tr->invalid_block_found = false;
    }
    if (tr->failed) {
        transaction_activate(tr);
    }

    /* Continue iterating unconditionally */
    return false;
}

enum fs_check_result fs_check_full(struct fs* fs) {
    bool free_set_valid, file_tree_valid;
    enum fs_check_result res = FS_CHECK_NO_ERROR;
    struct transaction iterate_tr;
    struct fs_check_state state = {
            .iter.file = fs_check_file,
            .internal_state_valid = true,
            .invalid_block_found = false,
    };

    transaction_init(&iterate_tr, fs, true);

    /* Check the free list for consistency */
    free_set_valid = block_set_check(&iterate_tr, &fs->free);
    if (!free_set_valid || iterate_tr.invalid_block_found) {
        pr_err("free block set is invalid\n");
        res = FS_CHECK_INVALID_FREE_SET;
        /*
         * We can recover the free set non-destructively by rebuilding from the
         * file tree, so we don't need to report the invalid block.
         */
        iterate_tr.invalid_block_found = false;
    }
    if (iterate_tr.failed) {
        pr_err("free set tree not fully readable\n");
        state.internal_state_valid = false;
        transaction_activate(&iterate_tr);
    }

    /* Check the file tree for consistency */
    file_tree_valid = block_tree_check(&iterate_tr, &fs->files);
    if (!file_tree_valid) {
        pr_err("file tree is invalid\n");
        res = FS_CHECK_INVALID_FILE_TREE;
    }
    if (iterate_tr.invalid_block_found) {
        pr_err("invalid block encountered in file tree\n");
        state.invalid_block_found = true;
        iterate_tr.invalid_block_found = false;
    }
    if (iterate_tr.failed) {
        pr_err("file tree not fully readable\n");
        state.internal_state_valid = false;
        transaction_activate(&iterate_tr);
    }

    file_iterate(&iterate_tr, NULL, false, &state.iter, true);

    /* Invalid blocks take precedence over internal consistency errors. */
    if (state.invalid_block_found) {
        res = FS_CHECK_INVALID_BLOCK;
    } else if (res == FS_CHECK_NO_ERROR && !state.internal_state_valid) {
        res = FS_CHECK_UNKNOWN;
    }
    if (!iterate_tr.failed) {
        transaction_fail(&iterate_tr);
    }
    transaction_free(&iterate_tr);

    return res;
}

/**
 * fs_check_quick - Check only the root blocks of @fs
 * @fs:         File system state object.
 *
 * A free set that still has an initial range is treated as a clear file
 * system and is not probed. Otherwise the file tree root and then the free
 * set root are probed with block_probe().
 *
 * Return: %FS_CHECK_NO_ERROR if the file system is clear or both probes
 * succeed, %FS_CHECK_INVALID_BLOCK otherwise.
 */
enum fs_check_result fs_check_quick(struct fs* fs) {
    if (!block_range_empty(fs->free.initial_range)) {
        /* Clear file system: there are no roots to probe. */
        return FS_CHECK_NO_ERROR;
    }
    if (!block_probe(fs, &fs->files.root, true)) {
        return FS_CHECK_INVALID_BLOCK;
    }
    if (!block_probe(fs, &fs->free.block_tree.root, false)) {
        return FS_CHECK_INVALID_BLOCK;
    }
    return FS_CHECK_NO_ERROR;
}

/**
 * fs_check - Check @fs, using the full scan only when it was requested
 * @fs:         File system state object.
 *
 * Return: result of fs_check_full() if @fs->needs_full_scan is set, otherwise
 * the result of fs_check_quick().
 */
enum fs_check_result fs_check(struct fs* fs) {
    if (!fs->needs_full_scan) {
        return fs_check_quick(fs);
    }

    pr_warn("%s filesystem requires full scan on mount\n", fs->name);
    return fs_check_full(fs);
}

/**
 * fs_file_tree_init - Initialize an empty file tree for a file system
 * @fs:        File system state object.
 * @tree:      Block tree to initialize as a file tree.
 *
 * Calls block_tree_init() with the block size of @fs->dev, the block number
 * size of @fs, and the combined block number plus mac size of @fs for both
 * remaining size arguments.
 */
void fs_file_tree_init(const struct fs* fs, struct block_tree* tree) {
    const size_t num_size = fs->block_num_size;
    const size_t entry_size = num_size + fs->mac_size;

    block_tree_init(tree, fs->dev->block_size, num_size, entry_size,
                    entry_size);
}

/**
 * fs_init - Initialize file system state
 * @fs:         File system state object.
 * @name:       File system name for error reporting. Must be a static string.
 * @key:        Key pointer. Must not be freed while @fs is in use.
 * @dev:        Main block device.
 * @super_dev:  Block device for super block.
 * @flags:      Any of &typedef fs_init_flags32_t, ORed together.
 *
 * Validates @super_dev and @flags, fills in the basic fields of @fs, adds @fs
 * to the global file system list and loads the super block. If
 * %FS_INIT_FLAGS_AUTO_CHECKPOINT is set and no valid checkpoint was loaded, a
 * full check is run and, if it finds no error, @fs->checkpoint_required is
 * set.
 *
 * Return: 0 on success. -1 if @super_dev is too small, if @flags combine
 * %FS_INIT_FLAGS_ALTERNATE_DATA with %FS_INIT_FLAGS_RESTORE_CHECKPOINT, or the
 * non-zero result of load_super_block() if loading failed; in the last case
 * @fs has already been destroyed again and its device pointers are cleared.
 */
int fs_init(struct fs* fs,
            const char* name,
            const struct key* key,
            struct block_device* dev,
            struct block_device* super_dev,
            fs_init_flags32_t flags) {
    int ret;

    if (super_dev->block_size < sizeof(struct super_block)) {
        /* Both values are sizes, so print them with the unsigned %zu. */
        pr_err("unsupported block size for super_dev, %zu < %zu\n",
               super_dev->block_size, sizeof(struct super_block));
        return -1;  // ERR_NOT_VALID?
    }

    /* Two super block slots (blocks 0 and 1) are used below. */
    if (super_dev->block_count < 2) {
        pr_err("unsupported block count for super_dev, %" PRIu64 "\n",
               super_dev->block_count);
        return -1;  // ERR_NOT_VALID?
    }

    if ((flags & FS_INIT_FLAGS_ALTERNATE_DATA) &&
        (flags & FS_INIT_FLAGS_RESTORE_CHECKPOINT)) {
        pr_err("Alternate file system cannot restore to a checkpoint\n");
        return -1;
    }

    fs->name = name;
    fs->key = key;
    fs->dev = dev;
    fs->super_dev = super_dev;
    fs->readable = false;
    fs->writable = false;
    fs->allow_tampering = flags & FS_INIT_FLAGS_ALLOW_TAMPERING;
    fs->checkpoint_required = false;
    list_initialize(&fs->transactions);
    list_initialize(&fs->allocated);
    fs->initial_super_block_tr = NULL;
    list_add_tail(&fs_list, &fs->node);

    if (dev == super_dev) {
        /* Blocks 0 and 1 of the shared device hold the super blocks. */
        fs->min_block_num = 2;
    } else {
        /* TODO: use 0 when btree code allows it */
        fs->min_block_num = 1;
    }
    fs->super_block[0] = 0;
    fs->super_block[1] = 1;
    ret = load_super_block(fs, flags);
    if (ret) {
        /* Undo the list registration done above and detach the devices. */
        fs_destroy(fs);
        fs->dev = NULL;
        fs->super_dev = NULL;
        return ret;
    }

    if ((flags & FS_INIT_FLAGS_AUTO_CHECKPOINT) &&
        !block_mac_valid_fs(fs, &fs->checkpoint)) {
        /* Only request a checkpoint of a file system that checks clean. */
        if (fs_check_full(fs) == FS_CHECK_NO_ERROR) {
            fs->checkpoint_required = true;
        } else {
            pr_err("Not automatically creating a checkpoint; "
                   "an error was found in filesystem %s\n",
                   fs->name);
        }
    }

    return 0;
}

/**
 * fs_destroy - Destroy file system state
 * @fs:         File system state object.
 *
 * Free any dynamically allocated state and check that @fs is not referenced by
 * any transactions.
 */
void fs_destroy(struct fs* fs) {
    if (fs->initial_super_block_tr) {
        if (!fs->initial_super_block_tr->failed) {
            transaction_fail(fs->initial_super_block_tr);
        }
        transaction_free(fs->initial_super_block_tr);
        free(fs->initial_super_block_tr);
        fs->initial_super_block_tr = NULL;
    }
    assert(list_is_empty(&fs->transactions));
    assert(list_is_empty(&fs->allocated));
    list_delete(&fs->node);
    fs->readable = false;
    fs->writable = false;
}

/**
 * fs_unknown_super_block_state_all - Notify filesystems of unknown disk state
 *
 * Call from other layers when detecting write failures that can cause the
 * in-memory state of super blocks (or other blocks that we don't care about)
 * to be different from the on-disk state. Write in-memory state to disk before
 * writing any other block.
 */
void fs_unknown_super_block_state_all(void) {
    struct fs* fs;
    list_for_every_entry(&fs_list, fs, struct fs, node) {
        /* TODO: filter out filesystems that are not affected? */
        /*
         * If this file system is not writable, we should not try to re-write
         * the current super block state. A read-only file system cannot have
         * any modifications that we are allowed to save, and it does not need
         * to be re-synced here as we cannot have previously failed to write its
         * superblock.
         */
        if (!fs->writable) {
            continue;
        }
        /*
         * We can't reinitialize an existing, failed special transaction here.
         * If an initial superblock write failed and triggered
         * fs_unknown_super_block_state_all() we need to leave that superblock
         * transaction in a failed state so that the transaction that
         * triggered the failing write can also be failed further up the call
         * chain. If a special transaction already exists we are guaranteed that
         * it will be reinitialized and flushed to disk before any new writes to
         * that FS, so we don't need to reinitialize it here.
         */
        write_current_super_block(fs, false /* reinitialize */);
    }
}

/**
 * fs_fail_all_transactions - Fail every active transaction of every fs
 *
 * Walks all file systems on the global list and calls transaction_fail() on
 * each of their transactions that is active and has not failed yet.
 */
void fs_fail_all_transactions(void) {
    struct fs* fs;
    struct transaction* cur;
    struct transaction* next;

    list_for_every_entry(&fs_list, fs, struct fs, node) {
        list_for_every_entry_safe(&fs->transactions, cur, next,
                                  struct transaction, node) {
            if (!transaction_is_active(cur) || cur->failed) {
                continue;
            }
            transaction_fail(cur);
        }
    }
}